1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include "ctree.h"
25 #include "disk-io.h"
26 #include "transaction.h"
27 #include "volumes.h"
28 #include "locking.h"
29 #include "btrfs_inode.h"
30 #include "async-thread.h"
31
32 /*
33  * backref_node, mapping_node and tree_block start with this
34  */
35 struct tree_entry {
36         struct rb_node rb_node;
37         u64 bytenr;
38 };
39
40 /*
41  * represents a tree block in the backref cache
42  */
43 struct backref_node {
44         struct rb_node rb_node;
45         u64 bytenr;
46         /* objectid of the tree block owner */
47         u64 owner;
48         /* list of upper level blocks that reference this block */
49         struct list_head upper;
50         /* list of child blocks in the cache */
51         struct list_head lower;
52         /* NULL if this node is not tree root */
53         struct btrfs_root *root;
54         /* extent buffer obtained by COWing the block */
55         struct extent_buffer *eb;
56         /* level of tree block */
57         unsigned int level:8;
58         /* 1 if the block is root of old snapshot */
59         unsigned int old_root:1;
60         /* 1 if no child blocks in the cache */
61         unsigned int lowest:1;
62         /* is the extent buffer locked */
63         unsigned int locked:1;
64         /* has the block been processed */
65         unsigned int processed:1;
66         /* have backrefs of this block been checked */
67         unsigned int checked:1;
68 };
69
70 /*
71  * represents a block pointer in the backref cache
72  */
73 struct backref_edge {
74         struct list_head list[2];
75         struct backref_node *node[2];
76         u64 blockptr;
77 };
78
79 #define LOWER   0
80 #define UPPER   1
81
82 struct backref_cache {
83         /* red black tree of all backref nodes in the cache */
84         struct rb_root rb_root;
85         /* list of backref nodes with no child block in the cache */
86         struct list_head pending[BTRFS_MAX_LEVEL];
87         spinlock_t lock;
88 };
89
90 /*
91  * map address of tree root to tree
92  */
93 struct mapping_node {
94         struct rb_node rb_node;
95         u64 bytenr;
96         void *data;
97 };
98
99 struct mapping_tree {
100         struct rb_root rb_root;
101         spinlock_t lock;
102 };
103
104 /*
105  * represents a tree block to process
106  */
107 struct tree_block {
108         struct rb_node rb_node;
109         u64 bytenr;
110         struct btrfs_key key;
111         unsigned int level:8;
112         unsigned int key_ready:1;
113 };
114
115 /* inode vector */
116 #define INODEVEC_SIZE 16
117
118 struct inodevec {
119         struct list_head list;
120         struct inode *inode[INODEVEC_SIZE];
121         int nr;
122 };
123
124 #define MAX_EXTENTS 128
125
126 struct file_extent_cluster {
127         u64 start;
128         u64 end;
129         u64 boundary[MAX_EXTENTS];
130         unsigned int nr;
131 };
132
133 struct reloc_control {
134         /* block group to relocate */
135         struct btrfs_block_group_cache *block_group;
136         /* extent tree */
137         struct btrfs_root *extent_root;
138         /* inode for moving data */
139         struct inode *data_inode;
140         struct btrfs_workers workers;
141         /* tree blocks that have been processed */
142         struct extent_io_tree processed_blocks;
143         /* map start of tree root to corresponding reloc tree */
144         struct mapping_tree reloc_root_tree;
145         /* list of reloc trees */
146         struct list_head reloc_roots;
147         u64 search_start;
148         u64 extents_found;
149         u64 extents_skipped;
150         int stage;
151         int create_reloc_root;
152         unsigned int found_file_extent:1;
153         unsigned int found_old_snapshot:1;
154 };
155
156 /* stages of data relocation */
157 #define MOVE_DATA_EXTENTS       0
158 #define UPDATE_DATA_PTRS        1
159
160 /*
161  * merge a reloc tree into its corresponding fs tree in worker threads
162  */
163 struct async_merge {
164         struct btrfs_work work;
165         struct reloc_control *rc;
166         struct btrfs_root *root;
167         struct completion *done;
168         atomic_t *num_pending;
169 };
170
171 static void mapping_tree_init(struct mapping_tree *tree)
172 {
173         tree->rb_root = RB_ROOT;
174         spin_lock_init(&tree->lock);
175 }
176
177 static void backref_cache_init(struct backref_cache *cache)
178 {
179         int i;
180         cache->rb_root = RB_ROOT;
181         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182                 INIT_LIST_HEAD(&cache->pending[i]);
183         spin_lock_init(&cache->lock);
184 }
185
186 static void backref_node_init(struct backref_node *node)
187 {
188         memset(node, 0, sizeof(*node));
189         INIT_LIST_HEAD(&node->upper);
190         INIT_LIST_HEAD(&node->lower);
191         RB_CLEAR_NODE(&node->rb_node);
192 }
193
194 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
195                                    struct rb_node *node)
196 {
197         struct rb_node **p = &root->rb_node;
198         struct rb_node *parent = NULL;
199         struct tree_entry *entry;
200
201         while (*p) {
202                 parent = *p;
203                 entry = rb_entry(parent, struct tree_entry, rb_node);
204
205                 if (bytenr < entry->bytenr)
206                         p = &(*p)->rb_left;
207                 else if (bytenr > entry->bytenr)
208                         p = &(*p)->rb_right;
209                 else
210                         return parent;
211         }
212
213         rb_link_node(node, parent, p);
214         rb_insert_color(node, root);
215         return NULL;
216 }
217
218 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
219 {
220         struct rb_node *n = root->rb_node;
221         struct tree_entry *entry;
222
223         while (n) {
224                 entry = rb_entry(n, struct tree_entry, rb_node);
225
226                 if (bytenr < entry->bytenr)
227                         n = n->rb_left;
228                 else if (bytenr > entry->bytenr)
229                         n = n->rb_right;
230                 else
231                         return n;
232         }
233         return NULL;
234 }
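
/*
 * Illustrative sketch, not part of the original file: backref_node,
 * mapping_node and tree_block all start with the same rb_node + bytenr
 * prefix as struct tree_entry, so the two helpers above can index any of
 * them by bytenr.  A caller converts back with rb_entry(), roughly:
 *
 *	struct mapping_node *mn = kmalloc(sizeof(*mn), GFP_NOFS);
 *
 *	mn->bytenr = root->node->start;
 *	mn->data = root;
 *	rb_node = tree_insert(&rc->reloc_root_tree.rb_root, mn->bytenr,
 *			      &mn->rb_node);
 *	BUG_ON(rb_node);	(non-NULL means this bytenr already exists)
 *
 *	rb_node = tree_search(&rc->reloc_root_tree.rb_root, mn->bytenr);
 *	if (rb_node)
 *		mn = rb_entry(rb_node, struct mapping_node, rb_node);
 *
 * __add_reloc_root() and find_reloc_root() below follow this pattern.
 */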
235
236 /*
237  * walk up backref nodes until reaching the node that represents the tree root
238  */
239 static struct backref_node *walk_up_backref(struct backref_node *node,
240                                             struct backref_edge *edges[],
241                                             int *index)
242 {
243         struct backref_edge *edge;
244         int idx = *index;
245
246         while (!list_empty(&node->upper)) {
247                 edge = list_entry(node->upper.next,
248                                   struct backref_edge, list[LOWER]);
249                 edges[idx++] = edge;
250                 node = edge->node[UPPER];
251         }
252         *index = idx;
253         return node;
254 }
255
256 /*
257  * walk down backref nodes to find start of next reference path
258  */
259 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
260                                               int *index)
261 {
262         struct backref_edge *edge;
263         struct backref_node *lower;
264         int idx = *index;
265
266         while (idx > 0) {
267                 edge = edges[idx - 1];
268                 lower = edge->node[LOWER];
269                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
270                         idx--;
271                         continue;
272                 }
273                 edge = list_entry(edge->list[LOWER].next,
274                                   struct backref_edge, list[LOWER]);
275                 edges[idx - 1] = edge;
276                 *index = idx;
277                 return edge->node[UPPER];
278         }
279         *index = 0;
280         return NULL;
281 }
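
/*
 * Illustrative sketch, not part of the original file: together the two
 * walkers above enumerate every reference path from a cached block up to
 * the tree roots that use it.  A typical caller looks roughly like:
 *
 *	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
 *	struct backref_node *next = node;
 *	int index = 0;
 *
 *	while (1) {
 *		next = walk_up_backref(next, edges, &index);
 *		root = next->root;
 *		... examine the root this path leads to ...
 *		next = walk_down_backref(edges, &index);
 *		if (!next || next->level <= node->level)
 *			break;
 *	}
 *
 * walk_up_backref() records the edges it followed in edges[], and
 * walk_down_backref() backtracks along them to the next unvisited path.
 */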
282
283 static void drop_node_buffer(struct backref_node *node)
284 {
285         if (node->eb) {
286                 if (node->locked) {
287                         btrfs_tree_unlock(node->eb);
288                         node->locked = 0;
289                 }
290                 free_extent_buffer(node->eb);
291                 node->eb = NULL;
292         }
293 }
294
295 static void drop_backref_node(struct backref_cache *tree,
296                               struct backref_node *node)
297 {
298         BUG_ON(!node->lowest);
299         BUG_ON(!list_empty(&node->upper));
300
301         drop_node_buffer(node);
302         list_del(&node->lower);
303
304         rb_erase(&node->rb_node, &tree->rb_root);
305         kfree(node);
306 }
307
308 /*
309  * remove a backref node from the backref cache
310  */
311 static void remove_backref_node(struct backref_cache *cache,
312                                 struct backref_node *node)
313 {
314         struct backref_node *upper;
315         struct backref_edge *edge;
316
317         if (!node)
318                 return;
319
320         BUG_ON(!node->lowest);
321         while (!list_empty(&node->upper)) {
322                 edge = list_entry(node->upper.next, struct backref_edge,
323                                   list[LOWER]);
324                 upper = edge->node[UPPER];
325                 list_del(&edge->list[LOWER]);
326                 list_del(&edge->list[UPPER]);
327                 kfree(edge);
328                 /*
329                  * add the node to the pending list if no other
330                  * child block is cached.
331                  */
332                 if (list_empty(&upper->lower)) {
333                         list_add_tail(&upper->lower,
334                                       &cache->pending[upper->level]);
335                         upper->lowest = 1;
336                 }
337         }
338         drop_backref_node(cache, node);
339 }
340
341 /*
342  * find reloc tree by address of tree root
343  */
344 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
345                                           u64 bytenr)
346 {
347         struct rb_node *rb_node;
348         struct mapping_node *node;
349         struct btrfs_root *root = NULL;
350
351         spin_lock(&rc->reloc_root_tree.lock);
352         rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
353         if (rb_node) {
354                 node = rb_entry(rb_node, struct mapping_node, rb_node);
355                 root = (struct btrfs_root *)node->data;
356         }
357         spin_unlock(&rc->reloc_root_tree.lock);
358         return root;
359 }
360
361 static int is_cowonly_root(u64 root_objectid)
362 {
363         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
364             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
365             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
366             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
367             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
368             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
369                 return 1;
370         return 0;
371 }
372
373 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
374                                         u64 root_objectid)
375 {
376         struct btrfs_key key;
377
378         key.objectid = root_objectid;
379         key.type = BTRFS_ROOT_ITEM_KEY;
380         if (is_cowonly_root(root_objectid))
381                 key.offset = 0;
382         else
383                 key.offset = (u64)-1;
384
385         return btrfs_read_fs_root_no_name(fs_info, &key);
386 }
387
388 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
389 static noinline_for_stack
390 struct btrfs_root *find_tree_root(struct reloc_control *rc,
391                                   struct extent_buffer *leaf,
392                                   struct btrfs_extent_ref_v0 *ref0)
393 {
394         struct btrfs_root *root;
395         u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
396         u64 generation = btrfs_ref_generation_v0(leaf, ref0);
397
398         BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
399
400         root = read_fs_root(rc->extent_root->fs_info, root_objectid);
401         BUG_ON(IS_ERR(root));
402
403         if (root->ref_cows &&
404             generation != btrfs_root_generation(&root->root_item))
405                 return NULL;
406
407         return root;
408 }
409 #endif
410
411 static noinline_for_stack
412 int find_inline_backref(struct extent_buffer *leaf, int slot,
413                         unsigned long *ptr, unsigned long *end)
414 {
415         struct btrfs_extent_item *ei;
416         struct btrfs_tree_block_info *bi;
417         u32 item_size;
418
419         item_size = btrfs_item_size_nr(leaf, slot);
420 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
421         if (item_size < sizeof(*ei)) {
422                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
423                 return 1;
424         }
425 #endif
426         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
427         WARN_ON(!(btrfs_extent_flags(leaf, ei) &
428                   BTRFS_EXTENT_FLAG_TREE_BLOCK));
429
430         if (item_size <= sizeof(*ei) + sizeof(*bi)) {
431                 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
432                 return 1;
433         }
434
435         bi = (struct btrfs_tree_block_info *)(ei + 1);
436         *ptr = (unsigned long)(bi + 1);
437         *end = (unsigned long)ei + item_size;
438         return 0;
439 }
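
/*
 * Illustrative sketch, not part of the original file: on success the
 * [*ptr, *end) window returned above covers the inline references that
 * follow the btrfs_tree_block_info item.  A caller walks it roughly like:
 *
 *	unsigned long ptr, end;
 *	struct btrfs_extent_inline_ref *iref;
 *	int type;
 *
 *	if (!find_inline_backref(leaf, slot, &ptr, &end)) {
 *		while (ptr < end) {
 *			iref = (struct btrfs_extent_inline_ref *)ptr;
 *			type = btrfs_extent_inline_ref_type(leaf, iref);
 *			... handle BTRFS_TREE_BLOCK_REF_KEY or
 *			    BTRFS_SHARED_BLOCK_REF_KEY here ...
 *			ptr += btrfs_extent_inline_ref_size(type);
 *		}
 *	}
 *
 * build_backref_tree() below interleaves this walk with the keyed
 * (non-inline) backref items in the extent tree.
 */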
440
441 /*
442  * build backref tree for a given tree block. the root of the backref tree
443  * corresponds to the tree block, and the leaves of the backref tree
444  * correspond to the roots of the b-trees that reference the tree block.
445  *
446  * the basic idea of this function is to check the backrefs of a given block
447  * to find the upper level blocks that reference the block, and then check
448  * the backrefs of these upper level blocks recursively. the recursion stops
449  * when a tree root is reached or the backrefs for the block are cached.
450  *
451  * NOTE: if we find backrefs for a block are cached, we know backrefs
452  * for all upper level blocks that directly/indirectly reference the
453  * block are also cached.
454  */
455 static struct backref_node *build_backref_tree(struct reloc_control *rc,
456                                                struct backref_cache *cache,
457                                                struct btrfs_key *node_key,
458                                                int level, u64 bytenr)
459 {
460         struct btrfs_path *path1;
461         struct btrfs_path *path2;
462         struct extent_buffer *eb;
463         struct btrfs_root *root;
464         struct backref_node *cur;
465         struct backref_node *upper;
466         struct backref_node *lower;
467         struct backref_node *node = NULL;
468         struct backref_node *exist = NULL;
469         struct backref_edge *edge;
470         struct rb_node *rb_node;
471         struct btrfs_key key;
472         unsigned long end;
473         unsigned long ptr;
474         LIST_HEAD(list);
475         int ret;
476         int err = 0;
477
478         path1 = btrfs_alloc_path();
479         path2 = btrfs_alloc_path();
480         if (!path1 || !path2) {
481                 err = -ENOMEM;
482                 goto out;
483         }
484
485         node = kmalloc(sizeof(*node), GFP_NOFS);
486         if (!node) {
487                 err = -ENOMEM;
488                 goto out;
489         }
490
491         backref_node_init(node);
492         node->bytenr = bytenr;
493         node->owner = 0;
494         node->level = level;
495         node->lowest = 1;
496         cur = node;
497 again:
498         end = 0;
499         ptr = 0;
500         key.objectid = cur->bytenr;
501         key.type = BTRFS_EXTENT_ITEM_KEY;
502         key.offset = (u64)-1;
503
504         path1->search_commit_root = 1;
505         path1->skip_locking = 1;
506         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
507                                 0, 0);
508         if (ret < 0) {
509                 err = ret;
510                 goto out;
511         }
512         BUG_ON(!ret || !path1->slots[0]);
513
514         path1->slots[0]--;
515
516         WARN_ON(cur->checked);
517         if (!list_empty(&cur->upper)) {
518                 /*
519                  * the backref was added previously when processing
520                  * a backref of type BTRFS_TREE_BLOCK_REF_KEY
521                  */
522                 BUG_ON(!list_is_singular(&cur->upper));
523                 edge = list_entry(cur->upper.next, struct backref_edge,
524                                   list[LOWER]);
525                 BUG_ON(!list_empty(&edge->list[UPPER]));
526                 exist = edge->node[UPPER];
527                 /*
528                  * add the upper level block to the pending list if we
529                  * need to check its backrefs
530                  */
531                 if (!exist->checked)
532                         list_add_tail(&edge->list[UPPER], &list);
533         } else {
534                 exist = NULL;
535         }
536
537         while (1) {
538                 cond_resched();
539                 eb = path1->nodes[0];
540
541                 if (ptr >= end) {
542                         if (path1->slots[0] >= btrfs_header_nritems(eb)) {
543                                 ret = btrfs_next_leaf(rc->extent_root, path1);
544                                 if (ret < 0) {
545                                         err = ret;
546                                         goto out;
547                                 }
548                                 if (ret > 0)
549                                         break;
550                                 eb = path1->nodes[0];
551                         }
552
553                         btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
554                         if (key.objectid != cur->bytenr) {
555                                 WARN_ON(exist);
556                                 break;
557                         }
558
559                         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
560                                 ret = find_inline_backref(eb, path1->slots[0],
561                                                           &ptr, &end);
562                                 if (ret)
563                                         goto next;
564                         }
565                 }
566
567                 if (ptr < end) {
568                         /* update key for inline back ref */
569                         struct btrfs_extent_inline_ref *iref;
570                         iref = (struct btrfs_extent_inline_ref *)ptr;
571                         key.type = btrfs_extent_inline_ref_type(eb, iref);
572                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
573                         WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
574                                 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
575                 }
576
577                 if (exist &&
578                     ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
579                       exist->owner == key.offset) ||
580                      (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
581                       exist->bytenr == key.offset))) {
582                         exist = NULL;
583                         goto next;
584                 }
585
586 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
589                         if (key.objectid == key.offset &&
590                             key.type == BTRFS_EXTENT_REF_V0_KEY) {
591                                 struct btrfs_extent_ref_v0 *ref0;
592                                 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593                                                 struct btrfs_extent_ref_v0);
594                                 root = find_tree_root(rc, eb, ref0);
595                                 if (root)
596                                         cur->root = root;
597                                 else
598                                         cur->old_root = 1;
599                                 break;
600                         }
601 #else
602                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
603                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
604 #endif
605                         if (key.objectid == key.offset) {
606                                 /*
607                                  * only root blocks of reloc trees use
608                                  * backrefs of this type.
609                                  */
610                                 root = find_reloc_root(rc, cur->bytenr);
611                                 BUG_ON(!root);
612                                 cur->root = root;
613                                 break;
614                         }
615
616                         edge = kzalloc(sizeof(*edge), GFP_NOFS);
617                         if (!edge) {
618                                 err = -ENOMEM;
619                                 goto out;
620                         }
621                         rb_node = tree_search(&cache->rb_root, key.offset);
622                         if (!rb_node) {
623                                 upper = kmalloc(sizeof(*upper), GFP_NOFS);
624                                 if (!upper) {
625                                         kfree(edge);
626                                         err = -ENOMEM;
627                                         goto out;
628                                 }
629                                 backref_node_init(upper);
630                                 upper->bytenr = key.offset;
631                                 upper->owner = 0;
632                                 upper->level = cur->level + 1;
633                                 /*
634                                  *  backrefs for the upper level block aren't
635                                  *  cached, add the block to the pending list
636                                  */
637                                 list_add_tail(&edge->list[UPPER], &list);
638                         } else {
639                                 upper = rb_entry(rb_node, struct backref_node,
640                                                  rb_node);
641                                 INIT_LIST_HEAD(&edge->list[UPPER]);
642                         }
643                         list_add(&edge->list[LOWER], &cur->upper);
644                         edge->node[UPPER] = upper;
645                         edge->node[LOWER] = cur;
646
647                         goto next;
648                 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
649                         goto next;
650                 }
651
652                 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
653                 root = read_fs_root(rc->extent_root->fs_info, key.offset);
654                 if (IS_ERR(root)) {
655                         err = PTR_ERR(root);
656                         goto out;
657                 }
658
659                 if (btrfs_root_level(&root->root_item) == cur->level) {
660                         /* tree root */
661                         BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662                                cur->bytenr);
663                         cur->root = root;
664                         break;
665                 }
666
667                 level = cur->level + 1;
668
669                 /*
670                  * search the tree to find upper level blocks that
671                  * reference the block.
672                  */
673                 path2->search_commit_root = 1;
674                 path2->skip_locking = 1;
675                 path2->lowest_level = level;
676                 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
677                 path2->lowest_level = 0;
678                 if (ret < 0) {
679                         err = ret;
680                         goto out;
681                 }
682                 if (ret > 0 && path2->slots[level] > 0)
683                         path2->slots[level]--;
684
685                 eb = path2->nodes[level];
686                 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
687                         cur->bytenr);
688
689                 lower = cur;
690                 for (; level < BTRFS_MAX_LEVEL; level++) {
691                         if (!path2->nodes[level]) {
692                                 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693                                        lower->bytenr);
694                                 lower->root = root;
695                                 break;
696                         }
697
698                         edge = kzalloc(sizeof(*edge), GFP_NOFS);
699                         if (!edge) {
700                                 err = -ENOMEM;
701                                 goto out;
702                         }
703
704                         eb = path2->nodes[level];
705                         rb_node = tree_search(&cache->rb_root, eb->start);
706                         if (!rb_node) {
707                                 upper = kmalloc(sizeof(*upper), GFP_NOFS);
708                                 if (!upper) {
709                                         kfree(edge);
710                                         err = -ENOMEM;
711                                         goto out;
712                                 }
713                                 backref_node_init(upper);
714                                 upper->bytenr = eb->start;
715                                 upper->owner = btrfs_header_owner(eb);
716                                 upper->level = lower->level + 1;
717
718                                 /*
719                                  * if we know the block isn't shared
720                                  * we can avoid checking its backrefs.
721                                  */
722                                 if (btrfs_block_can_be_shared(root, eb))
723                                         upper->checked = 0;
724                                 else
725                                         upper->checked = 1;
726
727                                 /*
728                                  * add the block to the pending list if we
729                                  * need to check its backrefs. only blocks
730                                  * at 'cur->level + 1' are added to the
731                                  * tail of the pending list. this guarantees
732                                  * we check backrefs from lower level
733                                  * blocks to upper level blocks.
734                                  */
735                                 if (!upper->checked &&
736                                     level == cur->level + 1) {
737                                         list_add_tail(&edge->list[UPPER],
738                                                       &list);
739                                 } else
740                                         INIT_LIST_HEAD(&edge->list[UPPER]);
741                         } else {
742                                 upper = rb_entry(rb_node, struct backref_node,
743                                                  rb_node);
744                                 BUG_ON(!upper->checked);
745                                 INIT_LIST_HEAD(&edge->list[UPPER]);
746                         }
747                         list_add_tail(&edge->list[LOWER], &lower->upper);
748                         edge->node[UPPER] = upper;
749                         edge->node[LOWER] = lower;
750
751                         if (rb_node)
752                                 break;
753                         lower = upper;
754                         upper = NULL;
755                 }
756                 btrfs_release_path(root, path2);
757 next:
758                 if (ptr < end) {
759                         ptr += btrfs_extent_inline_ref_size(key.type);
760                         if (ptr >= end) {
761                                 WARN_ON(ptr > end);
762                                 ptr = 0;
763                                 end = 0;
764                         }
765                 }
766                 if (ptr >= end)
767                         path1->slots[0]++;
768         }
769         btrfs_release_path(rc->extent_root, path1);
770
771         cur->checked = 1;
772         WARN_ON(exist);
773
774         /* the pending list isn't empty, take the first block to process */
775         if (!list_empty(&list)) {
776                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
777                 list_del_init(&edge->list[UPPER]);
778                 cur = edge->node[UPPER];
779                 goto again;
780         }
781
782         /*
783          * everything went well, connect the backref nodes and insert them
784          * into the cache.
785          */
786         BUG_ON(!node->checked);
787         rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
788         BUG_ON(rb_node);
789
790         list_for_each_entry(edge, &node->upper, list[LOWER])
791                 list_add_tail(&edge->list[UPPER], &list);
792
793         while (!list_empty(&list)) {
794                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795                 list_del_init(&edge->list[UPPER]);
796                 upper = edge->node[UPPER];
797
798                 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799                         if (upper->lowest) {
800                                 list_del_init(&upper->lower);
801                                 upper->lowest = 0;
802                         }
803
804                         list_add_tail(&edge->list[UPPER], &upper->lower);
805                         continue;
806                 }
807
808                 BUG_ON(!upper->checked);
809                 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
810                                       &upper->rb_node);
811                 BUG_ON(rb_node);
812
813                 list_add_tail(&edge->list[UPPER], &upper->lower);
814
815                 list_for_each_entry(edge, &upper->upper, list[LOWER])
816                         list_add_tail(&edge->list[UPPER], &list);
817         }
818 out:
819         btrfs_free_path(path1);
820         btrfs_free_path(path2);
821         if (err) {
822                 INIT_LIST_HEAD(&list);
823                 upper = node;
824                 while (upper) {
825                         if (RB_EMPTY_NODE(&upper->rb_node)) {
826                                 list_splice_tail(&upper->upper, &list);
827                                 kfree(upper);
828                         }
829
830                         if (list_empty(&list))
831                                 break;
832
833                         edge = list_entry(list.next, struct backref_edge,
834                                           list[LOWER]);
835                         upper = edge->node[UPPER];
836                         kfree(edge);
837                 }
838                 return ERR_PTR(err);
839         }
840         return node;
841 }
842
843 /*
844  * helper to add 'address of tree root -> reloc tree' mapping
845  */
846 static int __add_reloc_root(struct btrfs_root *root)
847 {
848         struct rb_node *rb_node;
849         struct mapping_node *node;
850         struct reloc_control *rc = root->fs_info->reloc_ctl;
851
852         node = kmalloc(sizeof(*node), GFP_NOFS);
853         BUG_ON(!node);
854
855         node->bytenr = root->node->start;
856         node->data = root;
857
858         spin_lock(&rc->reloc_root_tree.lock);
859         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
860                               node->bytenr, &node->rb_node);
861         spin_unlock(&rc->reloc_root_tree.lock);
862         BUG_ON(rb_node);
863
864         list_add_tail(&root->root_list, &rc->reloc_roots);
865         return 0;
866 }
867
868 /*
869  * helper to update/delete the 'address of tree root -> reloc tree'
870  * mapping
871  */
872 static int __update_reloc_root(struct btrfs_root *root, int del)
873 {
874         struct rb_node *rb_node;
875         struct mapping_node *node = NULL;
876         struct reloc_control *rc = root->fs_info->reloc_ctl;
877
878         spin_lock(&rc->reloc_root_tree.lock);
879         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
880                               root->commit_root->start);
881         if (rb_node) {
882                 node = rb_entry(rb_node, struct mapping_node, rb_node);
883                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
884         }
885         spin_unlock(&rc->reloc_root_tree.lock);
886
887         BUG_ON((struct btrfs_root *)node->data != root);
888
889         if (!del) {
890                 spin_lock(&rc->reloc_root_tree.lock);
891                 node->bytenr = root->node->start;
892                 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
893                                       node->bytenr, &node->rb_node);
894                 spin_unlock(&rc->reloc_root_tree.lock);
895                 BUG_ON(rb_node);
896         } else {
897                 list_del_init(&root->root_list);
898                 kfree(node);
899         }
900         return 0;
901 }
902
903 /*
904  * create a reloc tree for a given fs tree. a reloc tree is just a
905  * snapshot of the fs tree with a special root objectid.
906  */
907 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908                           struct btrfs_root *root)
909 {
910         struct btrfs_root *reloc_root;
911         struct extent_buffer *eb;
912         struct btrfs_root_item *root_item;
913         struct btrfs_key root_key;
914         int ret;
915
916         if (root->reloc_root) {
917                 reloc_root = root->reloc_root;
918                 reloc_root->last_trans = trans->transid;
919                 return 0;
920         }
921
922         if (!root->fs_info->reloc_ctl ||
923             !root->fs_info->reloc_ctl->create_reloc_root ||
924             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925                 return 0;
926
927         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928         BUG_ON(!root_item);
929
930         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931         root_key.type = BTRFS_ROOT_ITEM_KEY;
932         root_key.offset = root->root_key.objectid;
933
934         ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
935                               BTRFS_TREE_RELOC_OBJECTID);
936         BUG_ON(ret);
937
938         btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939         memcpy(root_item, &root->root_item, sizeof(*root_item));
940         btrfs_set_root_refs(root_item, 1);
941         btrfs_set_root_bytenr(root_item, eb->start);
942         btrfs_set_root_level(root_item, btrfs_header_level(eb));
943         btrfs_set_root_generation(root_item, trans->transid);
944         memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
945         root_item->drop_level = 0;
946
947         btrfs_tree_unlock(eb);
948         free_extent_buffer(eb);
949
950         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
951                                 &root_key, root_item);
952         BUG_ON(ret);
953         kfree(root_item);
954
955         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
956                                                  &root_key);
957         BUG_ON(IS_ERR(reloc_root));
958         reloc_root->last_trans = trans->transid;
959
960         __add_reloc_root(reloc_root);
961         root->reloc_root = reloc_root;
962         return 0;
963 }
964
965 /*
966  * update root item of reloc tree
967  */
968 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
969                             struct btrfs_root *root)
970 {
971         struct btrfs_root *reloc_root;
972         struct btrfs_root_item *root_item;
973         int del = 0;
974         int ret;
975
976         if (!root->reloc_root)
977                 return 0;
978
979         reloc_root = root->reloc_root;
980         root_item = &reloc_root->root_item;
981
982         if (btrfs_root_refs(root_item) == 0) {
983                 root->reloc_root = NULL;
984                 del = 1;
985         }
986
987         __update_reloc_root(reloc_root, del);
988
989         if (reloc_root->commit_root != reloc_root->node) {
990                 btrfs_set_root_node(root_item, reloc_root->node);
991                 free_extent_buffer(reloc_root->commit_root);
992                 reloc_root->commit_root = btrfs_root_node(reloc_root);
993         }
994
995         ret = btrfs_update_root(trans, root->fs_info->tree_root,
996                                 &reloc_root->root_key, root_item);
997         BUG_ON(ret);
998         return 0;
999 }
1000
1001 /*
1002  * helper to find first cached inode with inode number >= objectid
1003  * in a subvolume
1004  */
1005 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1006 {
1007         struct rb_node *node;
1008         struct rb_node *prev;
1009         struct btrfs_inode *entry;
1010         struct inode *inode;
1011
1012         spin_lock(&root->inode_lock);
1013 again:
1014         node = root->inode_tree.rb_node;
1015         prev = NULL;
1016         while (node) {
1017                 prev = node;
1018                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1019
1020                 if (objectid < entry->vfs_inode.i_ino)
1021                         node = node->rb_left;
1022                 else if (objectid > entry->vfs_inode.i_ino)
1023                         node = node->rb_right;
1024                 else
1025                         break;
1026         }
1027         if (!node) {
1028                 while (prev) {
1029                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
1030                         if (objectid <= entry->vfs_inode.i_ino) {
1031                                 node = prev;
1032                                 break;
1033                         }
1034                         prev = rb_next(prev);
1035                 }
1036         }
1037         while (node) {
1038                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1039                 inode = igrab(&entry->vfs_inode);
1040                 if (inode) {
1041                         spin_unlock(&root->inode_lock);
1042                         return inode;
1043                 }
1044
1045                 objectid = entry->vfs_inode.i_ino + 1;
1046                 if (cond_resched_lock(&root->inode_lock))
1047                         goto again;
1048
1049                 node = rb_next(node);
1050         }
1051         spin_unlock(&root->inode_lock);
1052         return NULL;
1053 }
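
/*
 * Illustrative sketch, not part of the original file: find_next_inode()
 * returns an igrab()'d inode, so every inode it hands out must be released
 * with iput().  Scanning all cached inodes of a subvolume looks roughly
 * like:
 *
 *	u64 objectid = 0;
 *	struct inode *inode;
 *
 *	while ((inode = find_next_inode(root, objectid))) {
 *		objectid = inode->i_ino + 1;
 *		... use the inode ...
 *		iput(inode);
 *	}
 *
 * invalidate_extent_cache() below uses this pattern to visit the inodes
 * in a key range.
 */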
1054
1055 static int in_block_group(u64 bytenr,
1056                           struct btrfs_block_group_cache *block_group)
1057 {
1058         if (bytenr >= block_group->key.objectid &&
1059             bytenr < block_group->key.objectid + block_group->key.offset)
1060                 return 1;
1061         return 0;
1062 }
1063
1064 /*
1065  * get new location of data
1066  */
1067 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1068                             u64 bytenr, u64 num_bytes)
1069 {
1070         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1071         struct btrfs_path *path;
1072         struct btrfs_file_extent_item *fi;
1073         struct extent_buffer *leaf;
1074         int ret;
1075
1076         path = btrfs_alloc_path();
1077         if (!path)
1078                 return -ENOMEM;
1079
1080         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1081         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1082                                        bytenr, 0);
1083         if (ret < 0)
1084                 goto out;
1085         if (ret > 0) {
1086                 ret = -ENOENT;
1087                 goto out;
1088         }
1089
1090         leaf = path->nodes[0];
1091         fi = btrfs_item_ptr(leaf, path->slots[0],
1092                             struct btrfs_file_extent_item);
1093
1094         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1095                btrfs_file_extent_compression(leaf, fi) ||
1096                btrfs_file_extent_encryption(leaf, fi) ||
1097                btrfs_file_extent_other_encoding(leaf, fi));
1098
1099         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1100                 ret = 1;
1101                 goto out;
1102         }
1103
1104         if (new_bytenr)
1105                 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106         ret = 0;
1107 out:
1108         btrfs_free_path(path);
1109         return ret;
1110 }
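
/*
 * Illustrative note, not part of the original file: the relocation data
 * inode is laid out so that (assuming index_cnt is set up elsewhere to the
 * start of the block group being relocated) file offset == bytenr -
 * index_cnt.  For example, if the block group starts at bytenr X and a
 * data extent used to live at X + 4M, its relocated copy sits at file
 * offset 4M in rc->data_inode; get_new_location() looks up the file
 * extent item at that offset and reports its disk_bytenr as the new
 * location, or returns 1 if the on-disk extent size no longer matches.
 */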
1111
1112 /*
1113  * update file extent items in the tree leaf to point to
1114  * the new locations.
1115  */
1116 static int replace_file_extents(struct btrfs_trans_handle *trans,
1117                                 struct reloc_control *rc,
1118                                 struct btrfs_root *root,
1119                                 struct extent_buffer *leaf,
1120                                 struct list_head *inode_list)
1121 {
1122         struct btrfs_key key;
1123         struct btrfs_file_extent_item *fi;
1124         struct inode *inode = NULL;
1125         struct inodevec *ivec = NULL;
1126         u64 parent;
1127         u64 bytenr;
1128         u64 new_bytenr;
1129         u64 num_bytes;
1130         u64 end;
1131         u32 nritems;
1132         u32 i;
1133         int ret;
1134         int first = 1;
1135         int dirty = 0;
1136
1137         if (rc->stage != UPDATE_DATA_PTRS)
1138                 return 0;
1139
1140         /* reloc trees always use full backref */
1141         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1142                 parent = leaf->start;
1143         else
1144                 parent = 0;
1145
1146         nritems = btrfs_header_nritems(leaf);
1147         for (i = 0; i < nritems; i++) {
1148                 cond_resched();
1149                 btrfs_item_key_to_cpu(leaf, &key, i);
1150                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1151                         continue;
1152                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1153                 if (btrfs_file_extent_type(leaf, fi) ==
1154                     BTRFS_FILE_EXTENT_INLINE)
1155                         continue;
1156                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1157                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1158                 if (bytenr == 0)
1159                         continue;
1160                 if (!in_block_group(bytenr, rc->block_group))
1161                         continue;
1162
1163                 /*
1164                  * if we are modifying a block in the fs tree, wait for readpage
1165                  * to complete and drop the extent cache
1166                  */
1167                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168                         if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169                                 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170                                 BUG_ON(!ivec);
1171                                 ivec->nr = 0;
1172                                 list_add_tail(&ivec->list, inode_list);
1173                         }
1174                         if (first) {
1175                                 inode = find_next_inode(root, key.objectid);
1176                                 if (inode)
1177                                         ivec->inode[ivec->nr++] = inode;
1178                                 first = 0;
1179                         } else if (inode && inode->i_ino < key.objectid) {
1180                                 inode = find_next_inode(root, key.objectid);
1181                                 if (inode)
1182                                         ivec->inode[ivec->nr++] = inode;
1183                         }
1184                         if (inode && inode->i_ino == key.objectid) {
1185                                 end = key.offset +
1186                                       btrfs_file_extent_num_bytes(leaf, fi);
1187                                 WARN_ON(!IS_ALIGNED(key.offset,
1188                                                     root->sectorsize));
1189                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1190                                 end--;
1191                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1192                                                       key.offset, end,
1193                                                       GFP_NOFS);
1194                                 if (!ret)
1195                                         continue;
1196
1197                                 btrfs_drop_extent_cache(inode, key.offset, end,
1198                                                         1);
1199                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1200                                               key.offset, end, GFP_NOFS);
1201                         }
1202                 }
1203
1204                 ret = get_new_location(rc->data_inode, &new_bytenr,
1205                                        bytenr, num_bytes);
1206                 if (ret > 0)
1207                         continue;
1208                 BUG_ON(ret < 0);
1209
1210                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1211                 dirty = 1;
1212
1213                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1214                 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1215                                            num_bytes, parent,
1216                                            btrfs_header_owner(leaf),
1217                                            key.objectid, key.offset);
1218                 BUG_ON(ret);
1219
1220                 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1221                                         parent, btrfs_header_owner(leaf),
1222                                         key.objectid, key.offset);
1223                 BUG_ON(ret);
1224         }
1225         if (dirty)
1226                 btrfs_mark_buffer_dirty(leaf);
1227         return 0;
1228 }
1229
1230 static noinline_for_stack
1231 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1232                      struct btrfs_path *path, int level)
1233 {
1234         struct btrfs_disk_key key1;
1235         struct btrfs_disk_key key2;
1236         btrfs_node_key(eb, &key1, slot);
1237         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1238         return memcmp(&key1, &key2, sizeof(key1));
1239 }
1240
1241 /*
1242  * try to replace tree blocks in the fs tree with the new blocks in the
1243  * reloc tree. tree blocks that haven't been modified since the reloc
1244  * tree was created can be replaced.
1245  *
1246  * if a block was replaced, the level of the block + 1 is returned.
1247  * if no block got replaced, 0 is returned. if there are other
1248  * errors, a negative error number is returned.
1249  */
1250 static int replace_path(struct btrfs_trans_handle *trans,
1251                         struct btrfs_root *dest, struct btrfs_root *src,
1252                         struct btrfs_path *path, struct btrfs_key *next_key,
1253                         struct extent_buffer **leaf,
1254                         int lowest_level, int max_level)
1255 {
1256         struct extent_buffer *eb;
1257         struct extent_buffer *parent;
1258         struct btrfs_key key;
1259         u64 old_bytenr;
1260         u64 new_bytenr;
1261         u64 old_ptr_gen;
1262         u64 new_ptr_gen;
1263         u64 last_snapshot;
1264         u32 blocksize;
1265         int level;
1266         int ret;
1267         int slot;
1268
1269         BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270         BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271         BUG_ON(lowest_level > 1 && leaf);
1272
1273         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274
1275         slot = path->slots[lowest_level];
1276         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277
1278         eb = btrfs_lock_root_node(dest);
1279         btrfs_set_lock_blocking(eb);
1280         level = btrfs_header_level(eb);
1281
1282         if (level < lowest_level) {
1283                 btrfs_tree_unlock(eb);
1284                 free_extent_buffer(eb);
1285                 return 0;
1286         }
1287
1288         ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1289         BUG_ON(ret);
1290         btrfs_set_lock_blocking(eb);
1291
1292         if (next_key) {
1293                 next_key->objectid = (u64)-1;
1294                 next_key->type = (u8)-1;
1295                 next_key->offset = (u64)-1;
1296         }
1297
1298         parent = eb;
1299         while (1) {
1300                 level = btrfs_header_level(parent);
1301                 BUG_ON(level < lowest_level);
1302
1303                 ret = btrfs_bin_search(parent, &key, level, &slot);
1304                 if (ret && slot > 0)
1305                         slot--;
1306
1307                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1308                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1309
1310                 old_bytenr = btrfs_node_blockptr(parent, slot);
1311                 blocksize = btrfs_level_size(dest, level - 1);
1312                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1313
1314                 if (level <= max_level) {
1315                         eb = path->nodes[level];
1316                         new_bytenr = btrfs_node_blockptr(eb,
1317                                                         path->slots[level]);
1318                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1319                                                         path->slots[level]);
1320                 } else {
1321                         new_bytenr = 0;
1322                         new_ptr_gen = 0;
1323                 }
1324
1325                 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1326                         WARN_ON(1);
1327                         ret = level;
1328                         break;
1329                 }
1330
1331                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332                     memcmp_node_keys(parent, slot, path, level)) {
1333                         if (level <= lowest_level && !leaf) {
1334                                 ret = 0;
1335                                 break;
1336                         }
1337
1338                         eb = read_tree_block(dest, old_bytenr, blocksize,
1339                                              old_ptr_gen);
1340                         btrfs_tree_lock(eb);
1341                         ret = btrfs_cow_block(trans, dest, eb, parent,
1342                                               slot, &eb);
1343                         BUG_ON(ret);
1344                         btrfs_set_lock_blocking(eb);
1345
1346                         if (level <= lowest_level) {
1347                                 *leaf = eb;
1348                                 ret = 0;
1349                                 break;
1350                         }
1351
1352                         btrfs_tree_unlock(parent);
1353                         free_extent_buffer(parent);
1354
1355                         parent = eb;
1356                         continue;
1357                 }
1358
1359                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360                                       path->slots[level]);
1361                 btrfs_release_path(src, path);
1362
1363                 path->lowest_level = level;
1364                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1365                 path->lowest_level = 0;
1366                 BUG_ON(ret);
1367
1368                 /*
1369                  * swap blocks in fs tree and reloc tree.
1370                  */
1371                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1372                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1373                 btrfs_mark_buffer_dirty(parent);
1374
1375                 btrfs_set_node_blockptr(path->nodes[level],
1376                                         path->slots[level], old_bytenr);
1377                 btrfs_set_node_ptr_generation(path->nodes[level],
1378                                               path->slots[level], old_ptr_gen);
1379                 btrfs_mark_buffer_dirty(path->nodes[level]);
1380
1381                 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1382                                         path->nodes[level]->start,
1383                                         src->root_key.objectid, level - 1, 0);
1384                 BUG_ON(ret);
1385                 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1386                                         0, dest->root_key.objectid, level - 1,
1387                                         0);
1388                 BUG_ON(ret);
1389
1390                 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1391                                         path->nodes[level]->start,
1392                                         src->root_key.objectid, level - 1, 0);
1393                 BUG_ON(ret);
1394
1395                 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1396                                         0, dest->root_key.objectid, level - 1,
1397                                         0);
1398                 BUG_ON(ret);
1399
1400                 btrfs_unlock_up_safe(path, 0);
1401
1402                 ret = level;
1403                 break;
1404         }
1405         btrfs_tree_unlock(parent);
1406         free_extent_buffer(parent);
1407         return ret;
1408 }
1409
1410 /*
1411  * helper to find next relocated block in reloc tree
1412  */
1413 static noinline_for_stack
1414 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1415                        int *level)
1416 {
1417         struct extent_buffer *eb;
1418         int i;
1419         u64 last_snapshot;
1420         u32 nritems;
1421
1422         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1423
1424         for (i = 0; i < *level; i++) {
1425                 free_extent_buffer(path->nodes[i]);
1426                 path->nodes[i] = NULL;
1427         }
1428
1429         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1430                 eb = path->nodes[i];
1431                 nritems = btrfs_header_nritems(eb);
1432                 while (path->slots[i] + 1 < nritems) {
1433                         path->slots[i]++;
1434                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1435                             last_snapshot)
1436                                 continue;
1437
1438                         *level = i;
1439                         return 0;
1440                 }
1441                 free_extent_buffer(path->nodes[i]);
1442                 path->nodes[i] = NULL;
1443         }
1444         return 1;
1445 }
1446
1447 /*
1448  * walk down reloc tree to find relocated block of lowest level
1449  */
1450 static noinline_for_stack
1451 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1452                          int *level)
1453 {
1454         struct extent_buffer *eb = NULL;
1455         int i;
1456         u64 bytenr;
1457         u64 ptr_gen = 0;
1458         u64 last_snapshot;
1459         u32 blocksize;
1460         u32 nritems;
1461
1462         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1463
1464         for (i = *level; i > 0; i--) {
1465                 eb = path->nodes[i];
1466                 nritems = btrfs_header_nritems(eb);
1467                 while (path->slots[i] < nritems) {
1468                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1469                         if (ptr_gen > last_snapshot)
1470                                 break;
1471                         path->slots[i]++;
1472                 }
1473                 if (path->slots[i] >= nritems) {
1474                         if (i == *level)
1475                                 break;
1476                         *level = i + 1;
1477                         return 0;
1478                 }
1479                 if (i == 1) {
1480                         *level = i;
1481                         return 0;
1482                 }
1483
1484                 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1485                 blocksize = btrfs_level_size(root, i - 1);
1486                 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1487                 BUG_ON(btrfs_header_level(eb) != i - 1);
1488                 path->nodes[i - 1] = eb;
1489                 path->slots[i - 1] = 0;
1490         }
1491         return 1;
1492 }
1493
1494 /*
1495  * invalidate extent cache for file extents whose key is in the range
1496  * [min_key, max_key)
1497  */
1498 static int invalidate_extent_cache(struct btrfs_root *root,
1499                                    struct btrfs_key *min_key,
1500                                    struct btrfs_key *max_key)
1501 {
1502         struct inode *inode = NULL;
1503         u64 objectid;
1504         u64 start, end;
1505
1506         objectid = min_key->objectid;
1507         while (1) {
1508                 cond_resched();
1509                 iput(inode);
1510
1511                 if (objectid > max_key->objectid)
1512                         break;
1513
1514                 inode = find_next_inode(root, objectid);
1515                 if (!inode)
1516                         break;
1517
1518                 if (inode->i_ino > max_key->objectid) {
1519                         iput(inode);
1520                         break;
1521                 }
1522
1523                 objectid = inode->i_ino + 1;
1524                 if (!S_ISREG(inode->i_mode))
1525                         continue;
1526
1527                 if (unlikely(min_key->objectid == inode->i_ino)) {
1528                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1529                                 continue;
1530                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1531                                 start = 0;
1532                         else {
1533                                 start = min_key->offset;
1534                                 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1535                         }
1536                 } else {
1537                         start = 0;
1538                 }
1539
1540                 if (unlikely(max_key->objectid == inode->i_ino)) {
1541                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1542                                 continue;
1543                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1544                                 end = (u64)-1;
1545                         } else {
1546                                 if (max_key->offset == 0)
1547                                         continue;
1548                                 end = max_key->offset;
1549                                 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1550                                 end--;
1551                         }
1552                 } else {
1553                         end = (u64)-1;
1554                 }
1555
1556                 /* the lock_extent waits for readpage to complete */
1557                 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1558                 btrfs_drop_extent_cache(inode, start, end, 1);
1559                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1560         }
1561         return 0;
1562 }
1563
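/*
 * release the inode references collected by replace_file_extents()
 * and free the inodevec structures
 */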
1564 static void put_inodes(struct list_head *list)
1565 {
1566         struct inodevec *ivec;
1567         while (!list_empty(list)) {
1568                 ivec = list_entry(list->next, struct inodevec, list);
1569                 list_del(&ivec->list);
1570                 while (ivec->nr > 0) {
1571                         ivec->nr--;
1572                         iput(ivec->inode[ivec->nr]);
1573                 }
1574                 kfree(ivec);
1575         }
1576 }
1577
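/*
 * helper to find the key of the next entry at or above the given level
 * of the path. returns 1 if the path already points at the last entry.
 */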
1578 static int find_next_key(struct btrfs_path *path, int level,
1579                          struct btrfs_key *key)
1581 {
1582         while (level < BTRFS_MAX_LEVEL) {
1583                 if (!path->nodes[level])
1584                         break;
1585                 if (path->slots[level] + 1 <
1586                     btrfs_header_nritems(path->nodes[level])) {
1587                         btrfs_node_key_to_cpu(path->nodes[level], key,
1588                                               path->slots[level] + 1);
1589                         return 0;
1590                 }
1591                 level++;
1592         }
1593         return 1;
1594 }
1595
1596 /*
1597  * merge the relocated tree blocks in reloc tree with corresponding
1598  * fs tree.
1599  */
1600 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1601                                                struct btrfs_root *root)
1602 {
1603         LIST_HEAD(inode_list);
1604         struct btrfs_key key;
1605         struct btrfs_key next_key;
1606         struct btrfs_trans_handle *trans;
1607         struct btrfs_root *reloc_root;
1608         struct btrfs_root_item *root_item;
1609         struct btrfs_path *path;
1610         struct extent_buffer *leaf = NULL;
1611         unsigned long nr;
1612         int level;
1613         int max_level;
1614         int replaced = 0;
1615         int ret;
1616         int err = 0;
1617
1618         path = btrfs_alloc_path();
1619         if (!path)
1620                 return -ENOMEM;
1621
1622         reloc_root = root->reloc_root;
1623         root_item = &reloc_root->root_item;
1624
1625         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1626                 level = btrfs_root_level(root_item);
1627                 extent_buffer_get(reloc_root->node);
1628                 path->nodes[level] = reloc_root->node;
1629                 path->slots[level] = 0;
1630         } else {
1631                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1632
1633                 level = root_item->drop_level;
1634                 BUG_ON(level == 0);
1635                 path->lowest_level = level;
1636                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1637                 path->lowest_level = 0;
1638                 if (ret < 0) {
1639                         btrfs_free_path(path);
1640                         return ret;
1641                 }
1642
1643                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1644                                       path->slots[level]);
1645                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1646
1647                 btrfs_unlock_up_safe(path, 0);
1648         }
1649
1650         if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1651                 trans = btrfs_start_transaction(root, 1);
1652
1653                 leaf = path->nodes[0];
1654                 btrfs_item_key_to_cpu(leaf, &key, 0);
1655                 btrfs_release_path(reloc_root, path);
1656
1657                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1658                 if (ret < 0) {
1659                         err = ret;
1660                         goto out;
1661                 }
1662
1663                 leaf = path->nodes[0];
1664                 btrfs_unlock_up_safe(path, 1);
1665                 ret = replace_file_extents(trans, rc, root, leaf,
1666                                            &inode_list);
1667                 if (ret < 0)
1668                         err = ret;
1669                 goto out;
1670         }
1671
1672         memset(&next_key, 0, sizeof(next_key));
1673
1674         while (1) {
1675                 leaf = NULL;
1676                 replaced = 0;
1677                 trans = btrfs_start_transaction(root, 1);
1678                 max_level = level;
1679
1680                 ret = walk_down_reloc_tree(reloc_root, path, &level);
1681                 if (ret < 0) {
1682                         err = ret;
1683                         goto out;
1684                 }
1685                 if (ret > 0)
1686                         break;
1687
1688                 if (!find_next_key(path, level, &key) &&
1689                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690                         ret = 0;
1691                 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692                         ret = replace_path(trans, root, reloc_root,
1693                                            path, &next_key, &leaf,
1694                                            level, max_level);
1695                 } else {
1696                         ret = replace_path(trans, root, reloc_root,
1697                                            path, &next_key, NULL,
1698                                            level, max_level);
1699                 }
1700                 if (ret < 0) {
1701                         err = ret;
1702                         goto out;
1703                 }
1704
1705                 if (ret > 0) {
1706                         level = ret;
1707                         btrfs_node_key_to_cpu(path->nodes[level], &key,
1708                                               path->slots[level]);
1709                         replaced = 1;
1710                 } else if (leaf) {
1711                         /*
1712                          * no block got replaced, try replacing file extents
1713                          */
1714                         btrfs_item_key_to_cpu(leaf, &key, 0);
1715                         ret = replace_file_extents(trans, rc, root, leaf,
1716                                                    &inode_list);
1717                         btrfs_tree_unlock(leaf);
1718                         free_extent_buffer(leaf);
1719                         BUG_ON(ret < 0);
1720                 }
1721
1722                 ret = walk_up_reloc_tree(reloc_root, path, &level);
1723                 if (ret > 0)
1724                         break;
1725
1726                 BUG_ON(level == 0);
1727                 /*
1728                  * save the merging progress in the drop_progress.
1729                  * this is OK since root refs == 1 in this case.
1730                  */
1731                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1732                                path->slots[level]);
1733                 root_item->drop_level = level;
1734
1735                 nr = trans->blocks_used;
1736                 btrfs_end_transaction(trans, root);
1737
1738                 btrfs_btree_balance_dirty(root, nr);
1739
1740                 /*
1741                  * put inodes outside the transaction, otherwise we may deadlock.
1742                  */
1743                 put_inodes(&inode_list);
1744
1745                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746                         invalidate_extent_cache(root, &key, &next_key);
1747         }
1748
1749         /*
1750  * handle the case where only one block in the fs tree needs to be
1751  * relocated and that block is the tree root.
1752          */
1753         leaf = btrfs_lock_root_node(root);
1754         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1755         btrfs_tree_unlock(leaf);
1756         free_extent_buffer(leaf);
1757         if (ret < 0)
1758                 err = ret;
1759 out:
1760         btrfs_free_path(path);
1761
1762         if (err == 0) {
1763                 memset(&root_item->drop_progress, 0,
1764                        sizeof(root_item->drop_progress));
1765                 root_item->drop_level = 0;
1766                 btrfs_set_root_refs(root_item, 0);
1767         }
1768
1769         nr = trans->blocks_used;
1770         btrfs_end_transaction(trans, root);
1771
1772         btrfs_btree_balance_dirty(root, nr);
1773
1774         put_inodes(&inode_list);
1775
1776         if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777                 invalidate_extent_cache(root, &key, &next_key);
1778
1779         return err;
1780 }
1781
1782 /*
1783  * callback for the work threads.
1784  * this function merges the reloc tree with the corresponding fs tree,
1785  * and then drops the reloc tree.
1786  */
1787 static void merge_func(struct btrfs_work *work)
1788 {
1789         struct btrfs_trans_handle *trans;
1790         struct btrfs_root *root;
1791         struct btrfs_root *reloc_root;
1792         struct async_merge *async;
1793
1794         async = container_of(work, struct async_merge, work);
1795         reloc_root = async->root;
1796
1797         if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798                 root = read_fs_root(reloc_root->fs_info,
1799                                     reloc_root->root_key.offset);
1800                 BUG_ON(IS_ERR(root));
1801                 BUG_ON(root->reloc_root != reloc_root);
1802
1803                 merge_reloc_root(async->rc, root);
1804
1805                 trans = btrfs_start_transaction(root, 1);
1806                 btrfs_update_reloc_root(trans, root);
1807                 btrfs_end_transaction(trans, root);
1808         }
1809
1810         btrfs_drop_snapshot(reloc_root, 0);
1811
1812         if (atomic_dec_and_test(async->num_pending))
1813                 complete(async->done);
1814
1815         kfree(async);
1816 }
1817
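/*
 * queue one worker per reloc tree to merge it with its fs tree and
 * drop it, then wait for all the workers to complete
 */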
1818 static int merge_reloc_roots(struct reloc_control *rc)
1819 {
1820         struct async_merge *async;
1821         struct btrfs_root *root;
1822         struct completion done;
1823         atomic_t num_pending;
1824
1825         init_completion(&done);
1826         atomic_set(&num_pending, 1);
1827
1828         while (!list_empty(&rc->reloc_roots)) {
1829                 root = list_entry(rc->reloc_roots.next,
1830                                   struct btrfs_root, root_list);
1831                 list_del_init(&root->root_list);
1832
1833                 async = kmalloc(sizeof(*async), GFP_NOFS);
1834                 BUG_ON(!async);
1835                 async->work.func = merge_func;
1836                 async->work.flags = 0;
1837                 async->rc = rc;
1838                 async->root = root;
1839                 async->done = &done;
1840                 async->num_pending = &num_pending;
1841                 atomic_inc(&num_pending);
1842                 btrfs_queue_worker(&rc->workers, &async->work);
1843         }
1844
1845         if (!atomic_dec_and_test(&num_pending))
1846                 wait_for_completion(&done);
1847
1848         BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849         return 0;
1850 }
1851
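/* free all tree_block entries left in the rb tree */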
1852 static void free_block_list(struct rb_root *blocks)
1853 {
1854         struct tree_block *block;
1855         struct rb_node *rb_node;
1856         while ((rb_node = rb_first(blocks))) {
1857                 block = rb_entry(rb_node, struct tree_block, rb_node);
1858                 rb_erase(rb_node, blocks);
1859                 kfree(block);
1860         }
1861 }
1862
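/*
 * make sure the fs tree that owns the reloc tree has been recorded in
 * the current transaction before the reloc tree is modified
 */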
1863 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1864                                       struct btrfs_root *reloc_root)
1865 {
1866         struct btrfs_root *root;
1867
1868         if (reloc_root->last_trans == trans->transid)
1869                 return 0;
1870
1871         root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1872         BUG_ON(IS_ERR(root));
1873         BUG_ON(root->reloc_root != reloc_root);
1874
1875         return btrfs_record_root_in_trans(trans, root);
1876 }
1877
1878 /*
1879  * select one tree from the trees that reference the block.
1880  * for blocks in reference counted trees, we prefer the reloc tree.
1881  * if no reloc tree is found and reloc_only is true, NULL is returned.
1882  */
1883 static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884                                             struct backref_node *node,
1885                                             struct backref_edge *edges[],
1886                                             int *nr, int reloc_only)
1887 {
1888         struct backref_node *next;
1889         struct btrfs_root *root;
1890         int index;
1891         int loop = 0;
1892 again:
1893         index = 0;
1894         next = node;
1895         while (1) {
1896                 cond_resched();
1897                 next = walk_up_backref(next, edges, &index);
1898                 root = next->root;
1899                 if (!root) {
1900                         BUG_ON(!node->old_root);
1901                         goto skip;
1902                 }
1903
1904                 /* no other choice for a non-reference counted tree */
1905                 if (!root->ref_cows) {
1906                         BUG_ON(reloc_only);
1907                         break;
1908                 }
1909
1910                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911                         record_reloc_root_in_trans(trans, root);
1912                         break;
1913                 }
1914
1915                 if (loop) {
1916                         btrfs_record_root_in_trans(trans, root);
1917                         break;
1918                 }
1919
1920                 if (reloc_only || next != node) {
1921                         if (!root->reloc_root)
1922                                 btrfs_record_root_in_trans(trans, root);
1923                         root = root->reloc_root;
1924                         /*
1925                          * if the reloc tree was created in the current
1926                          * transaction, there is no node in the backref
1927                          * tree that corresponds to the root of the reloc tree.
1928                          */
1929                         if (btrfs_root_last_snapshot(&root->root_item) ==
1930                             trans->transid - 1)
1931                                 break;
1932                 }
1933 skip:
1934                 root = NULL;
1935                 next = walk_down_backref(edges, &index);
1936                 if (!next || next->level <= node->level)
1937                         break;
1938         }
1939
1940         if (!root && !loop && !reloc_only) {
1941                 loop = 1;
1942                 goto again;
1943         }
1944
1945         if (root)
1946                 *nr = index;
1947         else
1948                 *nr = 0;
1949
1950         return root;
1951 }
1952
1953 static noinline_for_stack
1954 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955                                    struct backref_node *node)
1956 {
1957         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958         int nr;
1959         return __select_one_root(trans, node, edges, &nr, 0);
1960 }
1961
1962 static noinline_for_stack
1963 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1964                                      struct backref_node *node,
1965                                      struct backref_edge *edges[], int *nr)
1966 {
1967         return __select_one_root(trans, node, edges, nr, 1);
1968 }
1969
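/*
 * transfer the locked extent buffers held in the path to the backref
 * nodes along the edge chain and record the block pointer of each
 * edge, so the path no longer owns the buffers or the locks
 */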
1970 static void grab_path_buffers(struct btrfs_path *path,
1971                               struct backref_node *node,
1972                               struct backref_edge *edges[], int nr)
1973 {
1974         int i = 0;
1975         while (1) {
1976                 drop_node_buffer(node);
1977                 node->eb = path->nodes[node->level];
1978                 BUG_ON(!node->eb);
1979                 if (path->locks[node->level])
1980                         node->locked = 1;
1981                 path->nodes[node->level] = NULL;
1982                 path->locks[node->level] = 0;
1983
1984                 if (i >= nr)
1985                         break;
1986
1987                 edges[i]->blockptr = node->eb->start;
1988                 node = edges[i]->node[UPPER];
1989                 i++;
1990         }
1991 }
1992
1993 /*
1994  * relocate a tree block, and then update pointers in upper level
1995  * blocks that reference the block to point to the new location.
1996  *
1997  * if called by link_to_upper, the block has already been relocated.
1998  * in that case this function just updates pointers.
1999  */
2000 static int do_relocation(struct btrfs_trans_handle *trans,
2001                          struct backref_node *node,
2002                          struct btrfs_key *key,
2003                          struct btrfs_path *path, int lowest)
2004 {
2005         struct backref_node *upper;
2006         struct backref_edge *edge;
2007         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2008         struct btrfs_root *root;
2009         struct extent_buffer *eb;
2010         u32 blocksize;
2011         u64 bytenr;
2012         u64 generation;
2013         int nr;
2014         int slot;
2015         int ret;
2016         int err = 0;
2017
2018         BUG_ON(lowest && node->eb);
2019
2020         path->lowest_level = node->level + 1;
2021         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022                 cond_resched();
2023                 if (node->eb && node->eb->start == edge->blockptr)
2024                         continue;
2025
2026                 upper = edge->node[UPPER];
2027                 root = select_reloc_root(trans, upper, edges, &nr);
2028                 if (!root)
2029                         continue;
2030
2031                 if (upper->eb && !upper->locked)
2032                         drop_node_buffer(upper);
2033
2034                 if (!upper->eb) {
2035                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2036                         if (ret < 0) {
2037                                 err = ret;
2038                                 break;
2039                         }
2040                         BUG_ON(ret > 0);
2041
2042                         slot = path->slots[upper->level];
2043
2044                         btrfs_unlock_up_safe(path, upper->level + 1);
2045                         grab_path_buffers(path, upper, edges, nr);
2046
2047                         btrfs_release_path(NULL, path);
2048                 } else {
2049                         ret = btrfs_bin_search(upper->eb, key, upper->level,
2050                                                &slot);
2051                         BUG_ON(ret);
2052                 }
2053
2054                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2055                 if (!lowest) {
2056                         if (node->eb->start == bytenr) {
2057                                 btrfs_tree_unlock(upper->eb);
2058                                 upper->locked = 0;
2059                                 continue;
2060                         }
2061                 } else {
2062                         BUG_ON(node->bytenr != bytenr);
2063                 }
2064
2065                 blocksize = btrfs_level_size(root, node->level);
2066                 generation = btrfs_node_ptr_generation(upper->eb, slot);
2067                 eb = read_tree_block(root, bytenr, blocksize, generation);
2068                 btrfs_tree_lock(eb);
2069                 btrfs_set_lock_blocking(eb);
2070
2071                 if (!node->eb) {
2072                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073                                               slot, &eb);
2074                         if (ret < 0) {
2075                                 err = ret;
2076                                 break;
2077                         }
2078                         btrfs_set_lock_blocking(eb);
2079                         node->eb = eb;
2080                         node->locked = 1;
2081                 } else {
2082                         btrfs_set_node_blockptr(upper->eb, slot,
2083                                                 node->eb->start);
2084                         btrfs_set_node_ptr_generation(upper->eb, slot,
2085                                                       trans->transid);
2086                         btrfs_mark_buffer_dirty(upper->eb);
2087
2088                         ret = btrfs_inc_extent_ref(trans, root,
2089                                                 node->eb->start, blocksize,
2090                                                 upper->eb->start,
2091                                                 btrfs_header_owner(upper->eb),
2092                                                 node->level, 0);
2093                         BUG_ON(ret);
2094
2095                         ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096                         BUG_ON(ret);
2097                 }
2098                 if (!lowest) {
2099                         btrfs_tree_unlock(upper->eb);
2100                         upper->locked = 0;
2101                 }
2102         }
2103         path->lowest_level = 0;
2104         return err;
2105 }
2106
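/*
 * update pointers in the upper level blocks to point to the new
 * location of an already relocated block
 */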
2107 static int link_to_upper(struct btrfs_trans_handle *trans,
2108                          struct backref_node *node,
2109                          struct btrfs_path *path)
2110 {
2111         struct btrfs_key key;
2112         if (!node->eb || list_empty(&node->upper))
2113                 return 0;
2114
2115         btrfs_node_key_to_cpu(node->eb, &key, 0);
2116         return do_relocation(trans, node, &key, path, 0);
2117 }
2118
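/*
 * link every block left on the pending lists to its upper level
 * blocks and remove it from the backref cache; the cache is expected
 * to be empty afterwards
 */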
2119 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120                                 struct backref_cache *cache,
2121                                 struct btrfs_path *path)
2122 {
2123         struct backref_node *node;
2124         int level;
2125         int ret;
2126         int err = 0;
2127
2128         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129                 while (!list_empty(&cache->pending[level])) {
2130                         node = list_entry(cache->pending[level].next,
2131                                           struct backref_node, lower);
2132                         BUG_ON(node->level != level);
2133
2134                         ret = link_to_upper(trans, node, path);
2135                         if (ret < 0)
2136                                 err = ret;
2137                         /*
2138                          * this removes the node from the pending list and
2139                          * may add some other nodes to the level + 1
2140                          * pending list
2141                          */
2142                         remove_backref_node(cache, node);
2143                 }
2144         }
2145         BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146         return err;
2147 }
2148
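/*
 * mark a single block as processed. leaves and blocks inside the
 * block group being relocated are also recorded in
 * rc->processed_blocks.
 */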
2149 static void mark_block_processed(struct reloc_control *rc,
2150                                  struct backref_node *node)
2151 {
2152         u32 blocksize;
2153         if (node->level == 0 ||
2154             in_block_group(node->bytenr, rc->block_group)) {
2155                 blocksize = btrfs_level_size(rc->extent_root, node->level);
2156                 set_extent_bits(&rc->processed_blocks, node->bytenr,
2157                                 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158                                 GFP_NOFS);
2159         }
2160         node->processed = 1;
2161 }
2162
2163 /*
2164  * mark a block and all blocks that directly/indirectly reference it
2165  * as processed.
2166  */
2167 static void update_processed_blocks(struct reloc_control *rc,
2168                                     struct backref_node *node)
2169 {
2170         struct backref_node *next = node;
2171         struct backref_edge *edge;
2172         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2173         int index = 0;
2174
2175         while (next) {
2176                 cond_resched();
2177                 while (1) {
2178                         if (next->processed)
2179                                 break;
2180
2181                         mark_block_processed(rc, next);
2182
2183                         if (list_empty(&next->upper))
2184                                 break;
2185
2186                         edge = list_entry(next->upper.next,
2187                                           struct backref_edge, list[LOWER]);
2188                         edges[index++] = edge;
2189                         next = edge->node[UPPER];
2190                 }
2191                 next = walk_down_backref(edges, &index);
2192         }
2193 }
2194
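/* check whether a tree block has already been marked as processed */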
2195 static int tree_block_processed(u64 bytenr, u32 blocksize,
2196                                 struct reloc_control *rc)
2197 {
2198         if (test_range_bit(&rc->processed_blocks, bytenr,
2199                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2200                 return 1;
2201         return 0;
2202 }
2203
2204 /*
2205  * check if there are any file extent pointers in the leaf that point to
2206  * data requiring processing
2207  */
2208 static int check_file_extents(struct reloc_control *rc,
2209                               u64 bytenr, u32 blocksize, u64 ptr_gen)
2210 {
2211         struct btrfs_key found_key;
2212         struct btrfs_file_extent_item *fi;
2213         struct extent_buffer *leaf;
2214         u32 nritems;
2215         int i;
2216         int ret = 0;
2217
2218         leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219
2220         nritems = btrfs_header_nritems(leaf);
2221         for (i = 0; i < nritems; i++) {
2222                 cond_resched();
2223                 btrfs_item_key_to_cpu(leaf, &found_key, i);
2224                 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225                         continue;
2226                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227                 if (btrfs_file_extent_type(leaf, fi) ==
2228                     BTRFS_FILE_EXTENT_INLINE)
2229                         continue;
2230                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231                 if (bytenr == 0)
2232                         continue;
2233                 if (in_block_group(bytenr, rc->block_group)) {
2234                         ret = 1;
2235                         break;
2236                 }
2237         }
2238         free_extent_buffer(leaf);
2239         return ret;
2240 }
2241
2242 /*
2243  * scan child blocks of a given block to find blocks that require processing
2244  */
2245 static int add_child_blocks(struct btrfs_trans_handle *trans,
2246                             struct reloc_control *rc,
2247                             struct backref_node *node,
2248                             struct rb_root *blocks)
2249 {
2250         struct tree_block *block;
2251         struct rb_node *rb_node;
2252         u64 bytenr;
2253         u64 ptr_gen;
2254         u32 blocksize;
2255         u32 nritems;
2256         int i;
2257         int err = 0;
2258
2259         nritems = btrfs_header_nritems(node->eb);
2260         blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261         for (i = 0; i < nritems; i++) {
2262                 cond_resched();
2263                 bytenr = btrfs_node_blockptr(node->eb, i);
2264                 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265                 if (ptr_gen == trans->transid)
2266                         continue;
2267                 if (!in_block_group(bytenr, rc->block_group) &&
2268                     (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269                         continue;
2270                 if (tree_block_processed(bytenr, blocksize, rc))
2271                         continue;
2272
2273                 readahead_tree_block(rc->extent_root,
2274                                      bytenr, blocksize, ptr_gen);
2275         }
2276
2277         for (i = 0; i < nritems; i++) {
2278                 cond_resched();
2279                 bytenr = btrfs_node_blockptr(node->eb, i);
2280                 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281                 if (ptr_gen == trans->transid)
2282                         continue;
2283                 if (!in_block_group(bytenr, rc->block_group) &&
2284                     (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285                         continue;
2286                 if (tree_block_processed(bytenr, blocksize, rc))
2287                         continue;
2288                 if (!in_block_group(bytenr, rc->block_group) &&
2289                     !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290                         continue;
2291
2292                 block = kmalloc(sizeof(*block), GFP_NOFS);
2293                 if (!block) {
2294                         err = -ENOMEM;
2295                         break;
2296                 }
2297                 block->bytenr = bytenr;
2298                 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299                 block->level = node->level - 1;
2300                 block->key_ready = 1;
2301                 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302                 BUG_ON(rb_node);
2303         }
2304         if (err)
2305                 free_block_list(blocks);
2306         return err;
2307 }
2308
2309 /*
2310  * find adjacent blocks that require processing
2311  */
2312 static noinline_for_stack
2313 int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314                         struct reloc_control *rc,
2315                         struct backref_cache *cache,
2316                         struct rb_root *blocks, int level,
2317                         struct backref_node **upper)
2318 {
2319         struct backref_node *node;
2320         int ret = 0;
2321
2322         WARN_ON(!list_empty(&cache->pending[level]));
2323
2324         if (list_empty(&cache->pending[level + 1]))
2325                 return 1;
2326
2327         node = list_entry(cache->pending[level + 1].next,
2328                           struct backref_node, lower);
2329         if (node->eb)
2330                 ret = add_child_blocks(trans, rc, node, blocks);
2331
2332         *upper = node;
2333         return ret;
2334 }
2335
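/*
 * read a tree block to fill in its first key. until this runs,
 * block->key.objectid and block->key.offset temporarily hold the
 * blocksize and generation set up by add_tree_block().
 */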
2336 static int get_tree_block_key(struct reloc_control *rc,
2337                               struct tree_block *block)
2338 {
2339         struct extent_buffer *eb;
2340
2341         BUG_ON(block->key_ready);
2342         eb = read_tree_block(rc->extent_root, block->bytenr,
2343                              block->key.objectid, block->key.offset);
2344         WARN_ON(btrfs_header_level(eb) != block->level);
2345         if (block->level == 0)
2346                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2347         else
2348                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2349         free_extent_buffer(eb);
2350         block->key_ready = 1;
2351         return 0;
2352 }
2353
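/*
 * start readahead for a tree block whose key has not been read yet,
 * reusing the blocksize/generation stashed in block->key
 */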
2354 static int reada_tree_block(struct reloc_control *rc,
2355                             struct tree_block *block)
2356 {
2357         BUG_ON(block->key_ready);
2358         readahead_tree_block(rc->extent_root, block->bytenr,
2359                              block->key.objectid, block->key.offset);
2360         return 0;
2361 }
2362
2363 /*
2364  * helper function to relocate a tree block
2365  */
2366 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2367                                 struct reloc_control *rc,
2368                                 struct backref_node *node,
2369                                 struct btrfs_key *key,
2370                                 struct btrfs_path *path)
2371 {
2372         struct btrfs_root *root;
2373         int ret;
2374
2375         root = select_one_root(trans, node);
2376         if (unlikely(!root)) {
2377                 rc->found_old_snapshot = 1;
2378                 update_processed_blocks(rc, node);
2379                 return 0;
2380         }
2381
2382         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2383                 ret = do_relocation(trans, node, key, path, 1);
2384                 if (ret < 0)
2385                         goto out;
2386                 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387                         ret = replace_file_extents(trans, rc, root,
2388                                                    node->eb, NULL);
2389                         if (ret < 0)
2390                                 goto out;
2391                 }
2392                 drop_node_buffer(node);
2393         } else if (!root->ref_cows) {
2394                 path->lowest_level = node->level;
2395                 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396                 btrfs_release_path(root, path);
2397                 if (ret < 0)
2398                         goto out;
2399         } else if (root != node->root) {
2400                 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401         }
2402
2403         update_processed_blocks(rc, node);
2404         ret = 0;
2405 out:
2406         drop_node_buffer(node);
2407         return ret;
2408 }
2409
2410 /*
2411  * relocate a list of blocks
2412  */
2413 static noinline_for_stack
2414 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415                          struct reloc_control *rc, struct rb_root *blocks)
2416 {
2417         struct backref_cache *cache;
2418         struct backref_node *node;
2419         struct btrfs_path *path;
2420         struct tree_block *block;
2421         struct rb_node *rb_node;
2422         int level = -1;
2423         int ret;
2424         int err = 0;
2425
2426         path = btrfs_alloc_path();
2427         if (!path)
2428                 return -ENOMEM;
2429
2430         cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431         if (!cache) {
2432                 btrfs_free_path(path);
2433                 return -ENOMEM;
2434         }
2435
2436         backref_cache_init(cache);
2437
2438         rb_node = rb_first(blocks);
2439         while (rb_node) {
2440                 block = rb_entry(rb_node, struct tree_block, rb_node);
2441                 if (level == -1)
2442                         level = block->level;
2443                 else
2444                         BUG_ON(level != block->level);
2445                 if (!block->key_ready)
2446                         reada_tree_block(rc, block);
2447                 rb_node = rb_next(rb_node);
2448         }
2449
2450         rb_node = rb_first(blocks);
2451         while (rb_node) {
2452                 block = rb_entry(rb_node, struct tree_block, rb_node);
2453                 if (!block->key_ready)
2454                         get_tree_block_key(rc, block);
2455                 rb_node = rb_next(rb_node);
2456         }
2457
2458         rb_node = rb_first(blocks);
2459         while (rb_node) {
2460                 block = rb_entry(rb_node, struct tree_block, rb_node);
2461
2462                 node = build_backref_tree(rc, cache, &block->key,
2463                                           block->level, block->bytenr);
2464                 if (IS_ERR(node)) {
2465                         err = PTR_ERR(node);
2466                         goto out;
2467                 }
2468
2469                 ret = relocate_tree_block(trans, rc, node, &block->key,
2470                                           path);
2471                 if (ret < 0) {
2472                         err = ret;
2473                         goto out;
2474                 }
2475                 remove_backref_node(cache, node);
2476                 rb_node = rb_next(rb_node);
2477         }
2478
2479         if (level > 0)
2480                 goto out;
2481
2482         free_block_list(blocks);
2483
2484         /*
2485          * now backrefs of some upper level tree blocks have been cached,
2486          * try relocating blocks referenced by these upper level blocks.
2487          */
2488         while (1) {
2489                 struct backref_node *upper = NULL;
2490                 if (trans->transaction->in_commit ||
2491                     trans->transaction->delayed_refs.flushing)
2492                         break;
2493
2494                 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2495                                           &upper);
2496                 if (ret < 0)
2497                         err = ret;
2498                 if (ret != 0)
2499                         break;
2500
2501                 rb_node = rb_first(blocks);
2502                 while (rb_node) {
2503                         block = rb_entry(rb_node, struct tree_block, rb_node);
2504                         if (trans->transaction->in_commit ||
2505                             trans->transaction->delayed_refs.flushing)
2506                                 goto out;
2507                         BUG_ON(!block->key_ready);
2508                         node = build_backref_tree(rc, cache, &block->key,
2509                                                   level, block->bytenr);
2510                         if (IS_ERR(node)) {
2511                                 err = PTR_ERR(node);
2512                                 goto out;
2513                         }
2514
2515                         ret = relocate_tree_block(trans, rc, node,
2516                                                   &block->key, path);
2517                         if (ret < 0) {
2518                                 err = ret;
2519                                 goto out;
2520                         }
2521                         remove_backref_node(cache, node);
2522                         rb_node = rb_next(rb_node);
2523                 }
2524                 free_block_list(blocks);
2525
2526                 if (upper) {
2527                         ret = link_to_upper(trans, upper, path);
2528                         if (ret < 0) {
2529                                 err = ret;
2530                                 break;
2531                         }
2532                         remove_backref_node(cache, upper);
2533                 }
2534         }
2535 out:
2536         free_block_list(blocks);
2537
2538         ret = finish_pending_nodes(trans, cache, path);
2539         if (ret < 0)
2540                 err = ret;
2541
2542         kfree(cache);
2543         btrfs_free_path(path);
2544         return err;
2545 }
2546
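/*
 * insert a pinned extent mapping for the data relocation inode, so the
 * file range [start, end] maps to the existing extent at block_start
 * and readpage can pull the old data into the page cache
 */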
2547 static noinline_for_stack
2548 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2549                          u64 block_start)
2550 {
2551         struct btrfs_root *root = BTRFS_I(inode)->root;
2552         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2553         struct extent_map *em;
2554         int ret = 0;
2555
2556         em = alloc_extent_map(GFP_NOFS);
2557         if (!em)
2558                 return -ENOMEM;
2559
2560         em->start = start;
2561         em->len = end + 1 - start;
2562         em->block_len = em->len;
2563         em->block_start = block_start;
2564         em->bdev = root->fs_info->fs_devices->latest_bdev;
2565         set_bit(EXTENT_FLAG_PINNED, &em->flags);
2566
2567         lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2568         while (1) {
2569                 write_lock(&em_tree->lock);
2570                 ret = add_extent_mapping(em_tree, em);
2571                 write_unlock(&em_tree->lock);
2572                 if (ret != -EEXIST) {
2573                         free_extent_map(em);
2574                         break;
2575                 }
2576                 btrfs_drop_extent_cache(inode, start, end, 0);
2577         }
2578         unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2579         return ret;
2580 }
2581
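/*
 * read every page covering the cluster into the page cache and mark it
 * dirty/delalloc so writeback copies the data to its new location.
 * EXTENT_BOUNDARY bits record where each extent in the cluster starts.
 */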
2582 static int relocate_file_extent_cluster(struct inode *inode,
2583                                         struct file_extent_cluster *cluster)
2584 {
2585         u64 page_start;
2586         u64 page_end;
2587         u64 offset = BTRFS_I(inode)->index_cnt;
2588         unsigned long index;
2589         unsigned long last_index;
2590         unsigned int dirty_page = 0;
2591         struct page *page;
2592         struct file_ra_state *ra;
2593         int nr = 0;
2594         int ret = 0;
2595
2596         if (!cluster->nr)
2597                 return 0;
2598
2599         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2600         if (!ra)
2601                 return -ENOMEM;
2602
2603         index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2604         last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2605
2606         mutex_lock(&inode->i_mutex);
2607
2608         i_size_write(inode, cluster->end + 1 - offset);
2609         ret = setup_extent_mapping(inode, cluster->start - offset,
2610                                    cluster->end - offset, cluster->start);
2611         if (ret)
2612                 goto out_unlock;
2613
2614         file_ra_state_init(ra, inode->i_mapping);
2615
2616         WARN_ON(cluster->start != cluster->boundary[0]);
2617         while (index <= last_index) {
2618                 page = find_lock_page(inode->i_mapping, index);
2619                 if (!page) {
2620                         page_cache_sync_readahead(inode->i_mapping,
2621                                                   ra, NULL, index,
2622                                                   last_index + 1 - index);
2623                         page = grab_cache_page(inode->i_mapping, index);
2624                         if (!page) {
2625                                 ret = -ENOMEM;
2626                                 goto out_unlock;
2627                         }
2628                 }
2629
2630                 if (PageReadahead(page)) {
2631                         page_cache_async_readahead(inode->i_mapping,
2632                                                    ra, NULL, page, index,
2633                                                    last_index + 1 - index);
2634                 }
2635
2636                 if (!PageUptodate(page)) {
2637                         btrfs_readpage(NULL, page);
2638                         lock_page(page);
2639                         if (!PageUptodate(page)) {
2640                                 unlock_page(page);
2641                                 page_cache_release(page);
2642                                 ret = -EIO;
2643                                 goto out_unlock;
2644                         }
2645                 }
2646
2647                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2648                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2649
2650                 lock_extent(&BTRFS_I(inode)->io_tree,
2651                             page_start, page_end, GFP_NOFS);
2652
2653                 set_page_extent_mapped(page);
2654
2655                 if (nr < cluster->nr &&
2656                     page_start + offset == cluster->boundary[nr]) {
2657                         set_extent_bits(&BTRFS_I(inode)->io_tree,
2658                                         page_start, page_end,
2659                                         EXTENT_BOUNDARY, GFP_NOFS);
2660                         nr++;
2661                 }
2662                 btrfs_set_extent_delalloc(inode, page_start, page_end);
2663
2664                 set_page_dirty(page);
2665                 dirty_page++;
2666
2667                 unlock_extent(&BTRFS_I(inode)->io_tree,
2668                               page_start, page_end, GFP_NOFS);
2669                 unlock_page(page);
2670                 page_cache_release(page);
2671
2672                 index++;
2673                 if (nr < cluster->nr &&
2674                     page_end + 1 + offset == cluster->boundary[nr]) {
2675                         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676                                                            dirty_page);
2677                         dirty_page = 0;
2678                 }
2679         }
2680         if (dirty_page) {
2681                 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682                                                    dirty_page);
2683         }
2684         WARN_ON(nr != cluster->nr);
2685 out_unlock:
2686         mutex_unlock(&inode->i_mutex);
2687         kfree(ra);
2688         return ret;
2689 }
2690
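/*
 * add a data extent to the current cluster. the cluster is flushed
 * whenever the new extent is not contiguous with the previous one or
 * MAX_EXTENTS extents have been collected.
 */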
2691 static noinline_for_stack
2692 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2693                          struct file_extent_cluster *cluster)
2694 {
2695         int ret;
2696
2697         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2698                 ret = relocate_file_extent_cluster(inode, cluster);
2699                 if (ret)
2700                         return ret;
2701                 cluster->nr = 0;
2702         }
2703
2704         if (!cluster->nr)
2705                 cluster->start = extent_key->objectid;
2706         else
2707                 BUG_ON(cluster->nr >= MAX_EXTENTS);
2708         cluster->end = extent_key->objectid + extent_key->offset - 1;
2709         cluster->boundary[cluster->nr] = extent_key->objectid;
2710         cluster->nr++;
2711
2712         if (cluster->nr >= MAX_EXTENTS) {
2713                 ret = relocate_file_extent_cluster(inode, cluster);
2714                 if (ret)
2715                         return ret;
2716                 cluster->nr = 0;
2717         }
2718         return 0;
2719 }
2720
2721 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
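/*
 * walk forward from the extent item to the first EXTENT_REF_V0 item
 * of the extent and return the objectid recorded in it
 */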
2722 static int get_ref_objectid_v0(struct reloc_control *rc,
2723                                struct btrfs_path *path,
2724                                struct btrfs_key *extent_key,
2725                                u64 *ref_objectid, int *path_change)
2726 {
2727         struct btrfs_key key;
2728         struct extent_buffer *leaf;
2729         struct btrfs_extent_ref_v0 *ref0;
2730         int ret;
2731         int slot;
2732
2733         leaf = path->nodes[0];
2734         slot = path->slots[0];
2735         while (1) {
2736                 if (slot >= btrfs_header_nritems(leaf)) {
2737                         ret = btrfs_next_leaf(rc->extent_root, path);
2738                         if (ret < 0)
2739                                 return ret;
2740                         BUG_ON(ret > 0);
2741                         leaf = path->nodes[0];
2742                         slot = path->slots[0];
2743                         if (path_change)
2744                                 *path_change = 1;
2745                 }
2746                 btrfs_item_key_to_cpu(leaf, &key, slot);
2747                 if (key.objectid != extent_key->objectid)
2748                         return -ENOENT;
2749
2750                 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2751                         slot++;
2752                         continue;
2753                 }
2754                 ref0 = btrfs_item_ptr(leaf, slot,
2755                                 struct btrfs_extent_ref_v0);
2756                 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2757                 break;
2758         }
2759         return 0;
2760 }
2761 #endif
2762
2763 /*
2764  * helper to add a tree block to the list.
2765  * the major work is getting the generation and level of the block
2766  */
2767 static int add_tree_block(struct reloc_control *rc,
2768                           struct btrfs_key *extent_key,
2769                           struct btrfs_path *path,
2770                           struct rb_root *blocks)
2771 {
2772         struct extent_buffer *eb;
2773         struct btrfs_extent_item *ei;
2774         struct btrfs_tree_block_info *bi;
2775         struct tree_block *block;
2776         struct rb_node *rb_node;
2777         u32 item_size;
2778         int level = -1;
2779         int generation;
2780
2781         eb =  path->nodes[0];
2782         item_size = btrfs_item_size_nr(eb, path->slots[0]);
2783
2784         if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2785                 ei = btrfs_item_ptr(eb, path->slots[0],
2786                                 struct btrfs_extent_item);
2787                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2788                 generation = btrfs_extent_generation(eb, ei);
2789                 level = btrfs_tree_block_level(eb, bi);
2790         } else {
2791 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2792                 u64 ref_owner;
2793                 int ret;
2794
2795                 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2796                 ret = get_ref_objectid_v0(rc, path, extent_key,
2797                                           &ref_owner, NULL);
2798                 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2799                 level = (int)ref_owner;
2800                 /* FIXME: get real generation */
2801                 generation = 0;
2802 #else
2803                 BUG();
2804 #endif
2805         }
2806
2807         btrfs_release_path(rc->extent_root, path);
2808
2809         BUG_ON(level == -1);
2810
2811         block = kmalloc(sizeof(*block), GFP_NOFS);
2812         if (!block)
2813                 return -ENOMEM;
2814
2815         block->bytenr = extent_key->objectid;
2816         block->key.objectid = extent_key->offset;
2817         block->key.offset = generation;
2818         block->level = level;
2819         block->key_ready = 0;
2820
2821         rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2822         BUG_ON(rb_node);
2823
2824         return 0;
2825 }
2826
2827 /*
2828  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2829  */
2830 static int __add_tree_block(struct reloc_control *rc,
2831                             u64 bytenr, u32 blocksize,
2832                             struct rb_root *blocks)
2833 {
2834         struct btrfs_path *path;
2835         struct btrfs_key key;
2836         int ret;
2837
2838         if (tree_block_processed(bytenr, blocksize, rc))
2839                 return 0;
2840
2841         if (tree_search(blocks, bytenr))
2842                 return 0;
2843
2844         path = btrfs_alloc_path();
2845         if (!path)
2846                 return -ENOMEM;
2847
2848         key.objectid = bytenr;
2849         key.type = BTRFS_EXTENT_ITEM_KEY;
2850         key.offset = blocksize;
2851
2852         path->search_commit_root = 1;
2853         path->skip_locking = 1;
2854         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2855         if (ret < 0)
2856                 goto out;
2857         BUG_ON(ret);
2858
2859         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2860         ret = add_tree_block(rc, &key, path, blocks);
2861 out:
2862         btrfs_free_path(path);
2863         return ret;
2864 }
2865
2866 /*
2867  * helper to check if the block uses full backrefs for the pointers in it
2868  */
2869 static int block_use_full_backref(struct reloc_control *rc,
2870                                   struct extent_buffer *eb)
2871 {
2872         struct btrfs_path *path;
2873         struct btrfs_extent_item *ei;
2874         struct btrfs_key key;
2875         u64 flags;
2876         int ret;
2877
2878         if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2879             btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880                 return 1;
2881
2882         path = btrfs_alloc_path();
2883         BUG_ON(!path);
2884
2885         key.objectid = eb->start;
2886         key.type = BTRFS_EXTENT_ITEM_KEY;
2887         key.offset = eb->len;
2888
2889         path->search_commit_root = 1;
2890         path->skip_locking = 1;
2891         ret = btrfs_search_slot(NULL, rc->extent_root,
2892                                 &key, path, 0, 0);
2893         BUG_ON(ret);
2894
2895         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896                             struct btrfs_extent_item);
2897         flags = btrfs_extent_flags(path->nodes[0], ei);
2898         BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900                 ret = 1;
2901         else
2902                 ret = 0;
2903         btrfs_free_path(path);
2904         return ret;
2905 }
2906
2907 /*
2908  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2909  * this function scans the fs tree to find blocks that reference the data extent
2910  */
2911 static int find_data_references(struct reloc_control *rc,
2912                                 struct btrfs_key *extent_key,
2913                                 struct extent_buffer *leaf,
2914                                 struct btrfs_extent_data_ref *ref,
2915                                 struct rb_root *blocks)
2916 {
2917         struct btrfs_path *path;
2918         struct tree_block *block;
2919         struct btrfs_root *root;
2920         struct btrfs_file_extent_item *fi;
2921         struct rb_node *rb_node;
2922         struct btrfs_key key;
2923         u64 ref_root;
2924         u64 ref_objectid;
2925         u64 ref_offset;
2926         u32 ref_count;
2927         u32 nritems;
2928         int err = 0;
2929         int added = 0;
2930         int counted;
2931         int ret;
2932
2933         path = btrfs_alloc_path();
2934         if (!path)
2935                 return -ENOMEM;
2936
2937         ref_root = btrfs_extent_data_ref_root(leaf, ref);
2938         ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2939         ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2940         ref_count = btrfs_extent_data_ref_count(leaf, ref);
2941
2942         root = read_fs_root(rc->extent_root->fs_info, ref_root);
2943         if (IS_ERR(root)) {
2944                 err = PTR_ERR(root);
2945                 goto out;
2946         }
2947
2948         key.objectid = ref_objectid;
2949         key.offset = ref_offset;
2950         key.type = BTRFS_EXTENT_DATA_KEY;
2951
2952         path->search_commit_root = 1;
2953         path->skip_locking = 1;
2954         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2955         if (ret < 0) {
2956                 err = ret;
2957                 goto out;
2958         }
2959
2960         leaf = path->nodes[0];
2961         nritems = btrfs_header_nritems(leaf);
2962         /*
2963          * references from tree blocks that use full backrefs are not
2964          * counted against ref_count
2965          */
2966         if (block_use_full_backref(rc, leaf))
2967                 counted = 0;
2968         else
2969                 counted = 1;
2970         rb_node = tree_search(blocks, leaf->start);
2971         if (rb_node) {
2972                 if (counted)
2973                         added = 1;
2974                 else
2975                         path->slots[0] = nritems;
2976         }
2977
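             /*
              * walk the file extent items in the fs tree until all
              * ref_count references to this data extent are accounted for
              */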
2978         while (ref_count > 0) {
2979                 while (path->slots[0] >= nritems) {
2980                         ret = btrfs_next_leaf(root, path);
2981                         if (ret < 0) {
2982                                 err = ret;
2983                                 goto out;
2984                         }
2985                         if (ret > 0) {
2986                                 WARN_ON(1);
2987                                 goto out;
2988                         }
2989
2990                         leaf = path->nodes[0];
2991                         nritems = btrfs_header_nritems(leaf);
2992                         added = 0;
2993
2994                         if (block_use_full_backref(rc, leaf))
2995                                 counted = 0;
2996                         else
2997                                 counted = 1;
2998                         rb_node = tree_search(blocks, leaf->start);
2999                         if (rb_node) {
3000                                 if (counted)
3001                                         added = 1;
3002                                 else
3003                                         path->slots[0] = nritems;
3004                         }
3005                 }
3006
3007                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3008                 if (key.objectid != ref_objectid ||
3009                     key.type != BTRFS_EXTENT_DATA_KEY) {
3010                         WARN_ON(1);
3011                         break;
3012                 }
3013
3014                 fi = btrfs_item_ptr(leaf, path->slots[0],
3015                                     struct btrfs_file_extent_item);
3016
3017                 if (btrfs_file_extent_type(leaf, fi) ==
3018                     BTRFS_FILE_EXTENT_INLINE)
3019                         goto next;
3020
3021                 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3022                     extent_key->objectid)
3023                         goto next;
3024
3025                 key.offset -= btrfs_file_extent_offset(leaf, fi);
3026                 if (key.offset != ref_offset)
3027                         goto next;
3028
3029                 if (counted)
3030                         ref_count--;
3031                 if (added)
3032                         goto next;
3033
3034                 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3035                         block = kmalloc(sizeof(*block), GFP_NOFS);
3036                         if (!block) {
3037                                 err = -ENOMEM;
3038                                 break;
3039                         }
3040                         block->bytenr = leaf->start;
3041                         btrfs_item_key_to_cpu(leaf, &block->key, 0);
3042                         block->level = 0;
3043                         block->key_ready = 1;
3044                         rb_node = tree_insert(blocks, block->bytenr,
3045                                               &block->rb_node);
3046                         BUG_ON(rb_node);
3047                 }
3048                 if (counted)
3049                         added = 1;
3050                 else
3051                         path->slots[0] = nritems;
3052 next:
3053                 path->slots[0]++;
3054
3055         }
3056 out:
3057         btrfs_free_path(path);
3058         return err;
3059 }
3060
3061 /*
3062  * helper to find all tree blocks that reference a given data extent
3063  */
3064 static noinline_for_stack
3065 int add_data_references(struct reloc_control *rc,
3066                         struct btrfs_key *extent_key,
3067                         struct btrfs_path *path,
3068                         struct rb_root *blocks)
3069 {
3070         struct btrfs_key key;
3071         struct extent_buffer *eb;
3072         struct btrfs_extent_data_ref *dref;
3073         struct btrfs_extent_inline_ref *iref;
3074         unsigned long ptr;
3075         unsigned long end;
3076         u32 blocksize;
3077         int ret;
3078         int err = 0;
3079
3080         ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081                                extent_key->offset);
3082         BUG_ON(ret < 0);
3083         if (ret > 0) {
3084                 /* the relocated data is fragmented */
3085                 rc->extents_skipped++;
3086                 btrfs_release_path(rc->extent_root, path);
3087                 return 0;
3088         }
3089
3090         blocksize = btrfs_level_size(rc->extent_root, 0);
3091
3092         eb = path->nodes[0];
3093         ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094         end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3095 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3096         if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3097                 ptr = end;
3098         else
3099 #endif
3100                 ptr += sizeof(struct btrfs_extent_item);
3101
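             /* first pass: walk the inline backrefs in the extent item */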
3102         while (ptr < end) {
3103                 iref = (struct btrfs_extent_inline_ref *)ptr;
3104                 key.type = btrfs_extent_inline_ref_type(eb, iref);
3105                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3106                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3107                         ret = __add_tree_block(rc, key.offset, blocksize,
3108                                                blocks);
3109                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3110                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3111                         ret = find_data_references(rc, extent_key,
3112                                                    eb, dref, blocks);
3113                 } else {
3114                         BUG();
3115                 }
3116                 ptr += btrfs_extent_inline_ref_size(key.type);
3117         }
3118         WARN_ON(ptr > end);
3119
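             /* second pass: walk the keyed backref items following the extent item */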
3120         while (1) {
3121                 cond_resched();
3122                 eb = path->nodes[0];
3123                 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3124                         ret = btrfs_next_leaf(rc->extent_root, path);
3125                         if (ret < 0) {
3126                                 err = ret;
3127                                 break;
3128                         }
3129                         if (ret > 0)
3130                                 break;
3131                         eb = path->nodes[0];
3132                 }
3133
3134                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3135                 if (key.objectid != extent_key->objectid)
3136                         break;
3137
3138 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3139                 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3140                     key.type == BTRFS_EXTENT_REF_V0_KEY) {
3141 #else
3142                 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3143                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3144 #endif
3145                         ret = __add_tree_block(rc, key.offset, blocksize,
3146                                                blocks);
3147                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3148                         dref = btrfs_item_ptr(eb, path->slots[0],
3149                                               struct btrfs_extent_data_ref);
3150                         ret = find_data_references(rc, extent_key,
3151                                                    eb, dref, blocks);
3152                 } else {
3153                         ret = 0;
3154                 }
3155                 if (ret) {
3156                         err = ret;
3157                         break;
3158                 }
3159                 path->slots[0]++;
3160         }
3161         btrfs_release_path(rc->extent_root, path);
3162         if (err)
3163                 free_block_list(blocks);
3164         return err;
3165 }
3166
3167 /*
3168  * helper to find next unprocessed extent
3169  */
3170 static noinline_for_stack
3171 int find_next_extent(struct btrfs_trans_handle *trans,
3172                      struct reloc_control *rc, struct btrfs_path *path)
3173 {
3174         struct btrfs_key key;
3175         struct extent_buffer *leaf;
3176         u64 start, end, last;
3177         int ret;
3178
3179         last = rc->block_group->key.objectid + rc->block_group->key.offset;
3180         while (1) {
3181                 cond_resched();
3182                 if (rc->search_start >= last) {
3183                         ret = 1;
3184                         break;
3185                 }
3186
3187                 key.objectid = rc->search_start;
3188                 key.type = BTRFS_EXTENT_ITEM_KEY;
3189                 key.offset = 0;
3190
3191                 path->search_commit_root = 1;
3192                 path->skip_locking = 1;
3193                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3194                                         0, 0);
3195                 if (ret < 0)
3196                         break;
3197 next:
3198                 leaf = path->nodes[0];
3199                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3200                         ret = btrfs_next_leaf(rc->extent_root, path);
3201                         if (ret != 0)
3202                                 break;
3203                         leaf = path->nodes[0];
3204                 }
3205
3206                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3207                 if (key.objectid >= last) {
3208                         ret = 1;
3209                         break;
3210                 }
3211
3212                 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3213                     key.objectid + key.offset <= rc->search_start) {
3214                         path->slots[0]++;
3215                         goto next;
3216                 }
3217
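                     /* skip ranges that have already been processed */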
3218                 ret = find_first_extent_bit(&rc->processed_blocks,
3219                                             key.objectid, &start, &end,
3220                                             EXTENT_DIRTY);
3221
3222                 if (ret == 0 && start <= key.objectid) {
3223                         btrfs_release_path(rc->extent_root, path);
3224                         rc->search_start = end + 1;
3225                 } else {
3226                         rc->search_start = key.objectid + key.offset;
3227                         return 0;
3228                 }
3229         }
3230         btrfs_release_path(rc->extent_root, path);
3231         return ret;
3232 }
3233
3234 static void set_reloc_control(struct reloc_control *rc)
3235 {
3236         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3237         mutex_lock(&fs_info->trans_mutex);
3238         fs_info->reloc_ctl = rc;
3239         mutex_unlock(&fs_info->trans_mutex);
3240 }
3241
3242 static void unset_reloc_control(struct reloc_control *rc)
3243 {
3244         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3245         mutex_lock(&fs_info->trans_mutex);
3246         fs_info->reloc_ctl = NULL;
3247         mutex_unlock(&fs_info->trans_mutex);
3248 }
3249
3250 static int check_extent_flags(u64 flags)
3251 {
3252         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3253             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3254                 return 1;
3255         if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3256             !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3257                 return 1;
3258         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3259             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3260                 return 1;
3261         return 0;
3262 }
3263
3264
3265 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266 {
3267         struct rb_root blocks = RB_ROOT;
3268         struct btrfs_key key;
3269         struct file_extent_cluster *cluster;
3270         struct btrfs_trans_handle *trans = NULL;
3271         struct btrfs_path *path;
3272         struct btrfs_extent_item *ei;
3273         unsigned long nr;
3274         u64 flags;
3275         u32 item_size;
3276         int ret;
3277         int err = 0;
3278
3279         cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280         if (!cluster)
3281                 return -ENOMEM;
3282
3283         path = btrfs_alloc_path();
3284         if (!path) {
3285                 kfree(cluster);
3286                 return -ENOMEM;
3287         }
3288
3289         rc->extents_found = 0;
3290         rc->extents_skipped = 0;
3291
3292         rc->search_start = rc->block_group->key.objectid;
3293         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3294                           GFP_NOFS);
3295
3296         rc->create_reloc_root = 1;
3297         set_reloc_control(rc);
3298
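             /*
              * force a commit so the commit roots used by the
              * search_commit_root lookups below are up to date
              */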
3299         trans = btrfs_start_transaction(rc->extent_root, 1);
3300         btrfs_commit_transaction(trans, rc->extent_root);
3301
3302         while (1) {
3303                 trans = btrfs_start_transaction(rc->extent_root, 1);
3304
3305                 ret = find_next_extent(trans, rc, path);
3306                 if (ret < 0)
3307                         err = ret;
3308                 if (ret != 0)
3309                         break;
3310
3311                 rc->extents_found++;
3312
3313                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3314                                     struct btrfs_extent_item);
3315                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3316                 item_size = btrfs_item_size_nr(path->nodes[0],
3317                                                path->slots[0]);
3318                 if (item_size >= sizeof(*ei)) {
3319                         flags = btrfs_extent_flags(path->nodes[0], ei);
3320                         ret = check_extent_flags(flags);
3321                         BUG_ON(ret);
3322
3323                 } else {
3324 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3325                         u64 ref_owner;
3326                         int path_change = 0;
3327
3328                         BUG_ON(item_size !=
3329                                sizeof(struct btrfs_extent_item_v0));
3330                         ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3331                                                   &path_change);
3332                         if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3333                                 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3334                         else
3335                                 flags = BTRFS_EXTENT_FLAG_DATA;
3336
3337                         if (path_change) {
3338                                 btrfs_release_path(rc->extent_root, path);
3339
3340                                 path->search_commit_root = 1;
3341                                 path->skip_locking = 1;
3342                                 ret = btrfs_search_slot(NULL, rc->extent_root,
3343                                                         &key, path, 0, 0);
3344                                 if (ret < 0) {
3345                                         err = ret;
3346                                         break;
3347                                 }
3348                                 BUG_ON(ret > 0);
3349                         }
3350 #else
3351                         BUG();
3352 #endif
3353                 }
3354
3355                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3356                         ret = add_tree_block(rc, &key, path, &blocks);
3357                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3358                          (flags & BTRFS_EXTENT_FLAG_DATA)) {
3359                         ret = add_data_references(rc, &key, path, &blocks);
3360                 } else {
3361                         btrfs_release_path(rc->extent_root, path);
3362                         ret = 0;
3363                 }
3364                 if (ret < 0) {
3365                         err = ret;
3366                         break;
3367                 }
3368
3369                 if (!RB_EMPTY_ROOT(&blocks)) {
3370                         ret = relocate_tree_blocks(trans, rc, &blocks);
3371                         if (ret < 0) {
3372                                 err = ret;
3373                                 break;
3374                         }
3375                 }
3376
3377                 nr = trans->blocks_used;
3378                 btrfs_end_transaction(trans, rc->extent_root);
3379                 trans = NULL;
3380                 btrfs_btree_balance_dirty(rc->extent_root, nr);
3381
3382                 if (rc->stage == MOVE_DATA_EXTENTS &&
3383                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
3384                         rc->found_file_extent = 1;
3385                         ret = relocate_data_extent(rc->data_inode,
3386                                                    &key, cluster);
3387                         if (ret < 0) {
3388                                 err = ret;
3389                                 break;
3390                         }
3391                 }
3392         }
3393         btrfs_free_path(path);
3394
3395         if (trans) {
3396                 nr = trans->blocks_used;
3397                 btrfs_end_transaction(trans, rc->extent_root);
3398                 btrfs_btree_balance_dirty(rc->extent_root, nr);
3399         }
3400
3401         if (!err) {
3402                 ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3403                 if (ret < 0)
3404                         err = ret;
3405         }
3406
3407         kfree(cluster);
3408
3409         rc->create_reloc_root = 0;
3410         smp_mb();
3411
3412         if (rc->extents_found > 0) {
3413                 trans = btrfs_start_transaction(rc->extent_root, 1);
3414                 btrfs_commit_transaction(trans, rc->extent_root);
3415         }
3416
3417         merge_reloc_roots(rc);
3418
3419         unset_reloc_control(rc);
3420
3421         /* get rid of pinned extents */
3422         trans = btrfs_start_transaction(rc->extent_root, 1);
3423         btrfs_commit_transaction(trans, rc->extent_root);
3424
3425         return err;
3426 }
3427
3428 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3429                                  struct btrfs_root *root, u64 objectid)
3430 {
3431         struct btrfs_path *path;
3432         struct btrfs_inode_item *item;
3433         struct extent_buffer *leaf;
3434         int ret;
3435
3436         path = btrfs_alloc_path();
3437         if (!path)
3438                 return -ENOMEM;
3439
3440         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3441         if (ret)
3442                 goto out;
3443
3444         leaf = path->nodes[0];
3445         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3446         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3447         btrfs_set_inode_generation(leaf, item, 1);
3448         btrfs_set_inode_size(leaf, item, 0);
3449         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3450         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3451         btrfs_mark_buffer_dirty(leaf);
3452         btrfs_release_path(root, path);
3453 out:
3454         btrfs_free_path(path);
3455         return ret;
3456 }
3457
3458 /*
3459  * helper to create inode for data relocation.
3460  * the inode is in the data relocation tree and its link count is 0
3461  */
3462 static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3463                                         struct btrfs_block_group_cache *group)
3464 {
3465         struct inode *inode = NULL;
3466         struct btrfs_trans_handle *trans;
3467         struct btrfs_root *root;
3468         struct btrfs_key key;
3469         unsigned long nr;
3470         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3471         int err = 0;
3472
3473         root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3474         if (IS_ERR(root))
3475                 return ERR_CAST(root);
3476
3477         trans = btrfs_start_transaction(root, 1);
3478         BUG_ON(!trans);
3479
3480         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3481         if (err)
3482                 goto out;
3483
3484         err = __insert_orphan_inode(trans, root, objectid);
3485         BUG_ON(err);
3486
3487         key.objectid = objectid;
3488         key.type = BTRFS_INODE_ITEM_KEY;
3489         key.offset = 0;
3490         inode = btrfs_iget(root->fs_info->sb, &key, root);
3491         BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492         BTRFS_I(inode)->index_cnt = group->key.objectid;
3493
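             /*
              * the inode is never linked into a directory, put it on the
              * orphan list so it is cleaned up if we crash
              */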
3494         err = btrfs_orphan_add(trans, inode);
3495 out:
3496         nr = trans->blocks_used;
3497         btrfs_end_transaction(trans, root);
3498
3499         btrfs_btree_balance_dirty(root, nr);
3500         if (err) {
3501                 if (inode)
3502                         iput(inode);
3503                 inode = ERR_PTR(err);
3504         }
3505         return inode;
3506 }
3507
3508 /*
3509  * function to relocate all extents in a block group.
3510  */
3511 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3512 {
3513         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3514         struct reloc_control *rc;
3515         int ret;
3516         int err = 0;
3517
3518         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3519         if (!rc)
3520                 return -ENOMEM;
3521
3522         mapping_tree_init(&rc->reloc_root_tree);
3523         extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3524         INIT_LIST_HEAD(&rc->reloc_roots);
3525
3526         rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3527         BUG_ON(!rc->block_group);
3528
3529         btrfs_init_workers(&rc->workers, "relocate",
3530                            fs_info->thread_pool_size, NULL);
3531
3532         rc->extent_root = extent_root;
3533         btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3534
3535         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3536         if (IS_ERR(rc->data_inode)) {
3537                 err = PTR_ERR(rc->data_inode);
3538                 rc->data_inode = NULL;
3539                 goto out;
3540         }
3541
3542         printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3543                (unsigned long long)rc->block_group->key.objectid,
3544                (unsigned long long)rc->block_group->flags);
3545
3546         btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
3547         btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3548
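             /*
              * relocation alternates between two stages: MOVE_DATA_EXTENTS
              * copies the data into the relocation inode, UPDATE_DATA_PTRS
              * rewrites references to point at the new copies.  repeat
              * until no extents are found in the block group.
              */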
3549         while (1) {
3550                 rc->extents_found = 0;
3551                 rc->extents_skipped = 0;
3552
3553                 mutex_lock(&fs_info->cleaner_mutex);
3554
3555                 btrfs_clean_old_snapshots(fs_info->tree_root);
3556                 ret = relocate_block_group(rc);
3557
3558                 mutex_unlock(&fs_info->cleaner_mutex);
3559                 if (ret < 0) {
3560                         err = ret;
3561                         break;
3562                 }
3563
3564                 if (rc->extents_found == 0)
3565                         break;
3566
3567                 printk(KERN_INFO "btrfs: found %llu extents\n",
3568                         (unsigned long long)rc->extents_found);
3569
3570                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3571                         btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3572                         invalidate_mapping_pages(rc->data_inode->i_mapping,
3573                                                  0, -1);
3574                         rc->stage = UPDATE_DATA_PTRS;
3575                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3576                            rc->extents_skipped >= rc->extents_found) {
3577                         iput(rc->data_inode);
3578                         rc->data_inode = create_reloc_inode(fs_info,
3579                                                             rc->block_group);
3580                         if (IS_ERR(rc->data_inode)) {
3581                                 err = PTR_ERR(rc->data_inode);
3582                                 rc->data_inode = NULL;
3583                                 break;
3584                         }
3585                         rc->stage = MOVE_DATA_EXTENTS;
3586                         rc->found_file_extent = 0;
3587                 }
3588         }
3589
3590         filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3591                                      rc->block_group->key.objectid,
3592                                      rc->block_group->key.objectid +
3593                                      rc->block_group->key.offset - 1);
3594
3595         WARN_ON(rc->block_group->pinned > 0);
3596         WARN_ON(rc->block_group->reserved > 0);
3597         WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3598 out:
3599         iput(rc->data_inode);
3600         btrfs_stop_workers(&rc->workers);
3601         btrfs_put_block_group(rc->block_group);
3602         kfree(rc);
3603         return err;
3604 }
3605
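     /*
      * zero the ref count of a reloc root whose fs root no longer exists,
      * so the reloc root is dropped later instead of being merged
      */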
3606 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3607 {
3608         struct btrfs_trans_handle *trans;
3609         int ret;
3610
3611         trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3612
3613         memset(&root->root_item.drop_progress, 0,
3614                 sizeof(root->root_item.drop_progress));
3615         root->root_item.drop_level = 0;
3616         btrfs_set_root_refs(&root->root_item, 0);
3617         ret = btrfs_update_root(trans, root->fs_info->tree_root,
3618                                 &root->root_key, &root->root_item);
3619         BUG_ON(ret);
3620
3621         ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3622         BUG_ON(ret);
3623         return 0;
3624 }
3625
3626 /*
3627  * recover relocation interrupted by system crash.
3628  *
3629  * this function resumes merging reloc trees with corresponding fs trees.
3630  * this is important for keeping the sharing of tree blocks
3631  */
3632 int btrfs_recover_relocation(struct btrfs_root *root)
3633 {
3634         LIST_HEAD(reloc_roots);
3635         struct btrfs_key key;
3636         struct btrfs_root *fs_root;
3637         struct btrfs_root *reloc_root;
3638         struct btrfs_path *path;
3639         struct extent_buffer *leaf;
3640         struct reloc_control *rc = NULL;
3641         struct btrfs_trans_handle *trans;
3642         int ret;
3643         int err = 0;
3644
3645         path = btrfs_alloc_path();
3646         if (!path)
3647                 return -ENOMEM;
3648
3649         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3650         key.type = BTRFS_ROOT_ITEM_KEY;
3651         key.offset = (u64)-1;
3652
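             /*
              * walk the root items of BTRFS_TREE_RELOC_OBJECTID from the
              * highest offset down to collect all leftover reloc roots
              */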
3653         while (1) {
3654                 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3655                                         path, 0, 0);
3656                 if (ret < 0) {
3657                         err = ret;
3658                         goto out;
3659                 }
3660                 if (ret > 0) {
3661                         if (path->slots[0] == 0)
3662                                 break;
3663                         path->slots[0]--;
3664                 }
3665                 leaf = path->nodes[0];
3666                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3667                 btrfs_release_path(root->fs_info->tree_root, path);
3668
3669                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3670                     key.type != BTRFS_ROOT_ITEM_KEY)
3671                         break;
3672
3673                 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3674                 if (IS_ERR(reloc_root)) {
3675                         err = PTR_ERR(reloc_root);
3676                         goto out;
3677                 }
3678
3679                 list_add(&reloc_root->root_list, &reloc_roots);
3680
3681                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3682                         fs_root = read_fs_root(root->fs_info,
3683                                                reloc_root->root_key.offset);
3684                         if (IS_ERR(fs_root)) {
3685                                 ret = PTR_ERR(fs_root);
3686                                 if (ret != -ENOENT) {
3687                                         err = ret;
3688                                         goto out;
3689                                 }
3690                                 mark_garbage_root(reloc_root);
3691                         }
3692                 }
3693
3694                 if (key.offset == 0)
3695                         break;
3696
3697                 key.offset--;
3698         }
3699         btrfs_release_path(root->fs_info->tree_root, path);
3700
3701         if (list_empty(&reloc_roots))
3702                 goto out;
3703
3704         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3705         if (!rc) {
3706                 err = -ENOMEM;
3707                 goto out;
3708         }
3709
3710         mapping_tree_init(&rc->reloc_root_tree);
3711         INIT_LIST_HEAD(&rc->reloc_roots);
3712         btrfs_init_workers(&rc->workers, "relocate",
3713                            root->fs_info->thread_pool_size, NULL);
3714         rc->extent_root = root->fs_info->extent_root;
3715
3716         set_reloc_control(rc);
3717
3718         while (!list_empty(&reloc_roots)) {
3719                 reloc_root = list_entry(reloc_roots.next,
3720                                         struct btrfs_root, root_list);
3721                 list_del(&reloc_root->root_list);
3722
3723                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3724                         list_add_tail(&reloc_root->root_list,
3725                                       &rc->reloc_roots);
3726                         continue;
3727                 }
3728
3729                 fs_root = read_fs_root(root->fs_info,
3730                                        reloc_root->root_key.offset);
3731                 BUG_ON(IS_ERR(fs_root));
3732
3733                 __add_reloc_root(reloc_root);
3734                 fs_root->reloc_root = reloc_root;
3735         }
3736
3737         trans = btrfs_start_transaction(rc->extent_root, 1);
3738         btrfs_commit_transaction(trans, rc->extent_root);
3739
3740         merge_reloc_roots(rc);
3741
3742         unset_reloc_control(rc);
3743
3744         trans = btrfs_start_transaction(rc->extent_root, 1);
3745         btrfs_commit_transaction(trans, rc->extent_root);
3746 out:
3747         if (rc) {
3748                 btrfs_stop_workers(&rc->workers);
3749                 kfree(rc);
3750         }
3751         while (!list_empty(&reloc_roots)) {
3752                 reloc_root = list_entry(reloc_roots.next,
3753                                         struct btrfs_root, root_list);
3754                 list_del(&reloc_root->root_list);
3755                 free_extent_buffer(reloc_root->node);
3756                 free_extent_buffer(reloc_root->commit_root);
3757                 kfree(reloc_root);
3758         }
3759         btrfs_free_path(path);
3760
3761         if (err == 0) {
3762                 /* cleanup orphan inode in data relocation tree */
3763                 fs_root = read_fs_root(root->fs_info,
3764                                        BTRFS_DATA_RELOC_TREE_OBJECTID);
3765                 if (IS_ERR(fs_root))
3766                         err = PTR_ERR(fs_root);
3767                 else
3768                         btrfs_orphan_cleanup(fs_root);
3769         }
3770         return err;
3771 }
3772
3773 /*
3774  * helper to add ordered checksum for data relocation.
3775  *
3776  * cloning the existing checksums properly handles nodatasum extents.
3777  * it also saves the CPU time of recalculating the checksums.
3778  */
3779 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3780 {
3781         struct btrfs_ordered_sum *sums;
3782         struct btrfs_sector_sum *sector_sum;
3783         struct btrfs_ordered_extent *ordered;
3784         struct btrfs_root *root = BTRFS_I(inode)->root;
3785         size_t offset;
3786         int ret;
3787         u64 disk_bytenr;
3788         LIST_HEAD(list);
3789
3790         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3791         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3792
3793         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3794         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3795                                        disk_bytenr + len - 1, &list);
3796
3797         while (!list_empty(&list)) {
3798                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3799                 list_del_init(&sums->list);
3800
3801                 sector_sum = sums->sums;
3802                 sums->bytenr = ordered->start;
3803
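                     /*
                      * the csums were looked up at the old disk location,
                      * rebase each sector checksum onto the new ordered
                      * extent start
                      */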
3804                 offset = 0;
3805                 while (offset < sums->len) {
3806                         sector_sum->bytenr += ordered->start - disk_bytenr;
3807                         sector_sum++;
3808                         offset += root->sectorsize;
3809                 }
3810
3811                 btrfs_add_ordered_sum(inode, ordered, sums);
3812         }
3813         btrfs_put_ordered_extent(ordered);
3814         return 0;
3815 }