Btrfs: Add readahead to the online shrinker, and a mount -o alloc_start= for testing
[pandora-kernel.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/crc32c.h>
21 #include <linux/pagemap.h>
22 #include "hash.h"
23 #include "ctree.h"
24 #include "disk-io.h"
25 #include "print-tree.h"
26 #include "transaction.h"
27
28 #define BLOCK_GROUP_DATA EXTENT_WRITEBACK
29 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
30 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
31
32 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
33                                  btrfs_root *extent_root);
34 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
35                                btrfs_root *extent_root);
36
37 static int cache_block_group(struct btrfs_root *root,
38                              struct btrfs_block_group_cache *block_group)
39 {
40         struct btrfs_path *path;
41         int ret;
42         struct btrfs_key key;
43         struct extent_buffer *leaf;
44         struct extent_map_tree *free_space_cache;
45         int slot;
46         u64 last = 0;
47         u64 hole_size;
48         u64 first_free;
49         int found = 0;
50
51         if (!block_group)
52                 return 0;
53
54         root = root->fs_info->extent_root;
55         free_space_cache = &root->fs_info->free_space_cache;
56
57         if (block_group->cached)
58                 return 0;
59
60         path = btrfs_alloc_path();
61         if (!path)
62                 return -ENOMEM;
63
64         path->reada = 2;
65         first_free = block_group->key.objectid;
66         key.objectid = block_group->key.objectid;
67         key.offset = 0;
68
69         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
70         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
71
72         if (ret < 0)
73                 return ret;
74
75         if (ret && path->slots[0] > 0)
76                 path->slots[0]--;
77
78         while(1) {
79                 leaf = path->nodes[0];
80                 slot = path->slots[0];
81                 if (slot >= btrfs_header_nritems(leaf)) {
82                         ret = btrfs_next_leaf(root, path);
83                         if (ret < 0)
84                                 goto err;
85                         if (ret == 0) {
86                                 continue;
87                         } else {
88                                 break;
89                         }
90                 }
91
92                 btrfs_item_key_to_cpu(leaf, &key, slot);
93                 if (key.objectid < block_group->key.objectid) {
94                         if (btrfs_key_type(&key) != BTRFS_EXTENT_REF_KEY &&
95                             key.objectid + key.offset > first_free)
96                                 first_free = key.objectid + key.offset;
97                         goto next;
98                 }
99
100                 if (key.objectid >= block_group->key.objectid +
101                     block_group->key.offset) {
102                         break;
103                 }
104
105                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
106                         if (!found) {
107                                 last = first_free;
108                                 found = 1;
109                         }
110                         if (key.objectid > last) {
111                                 hole_size = key.objectid - last;
112                                 set_extent_dirty(free_space_cache, last,
113                                                  last + hole_size - 1,
114                                                  GFP_NOFS);
115                         }
116                         last = key.objectid + key.offset;
117                 }
118 next:
119                 path->slots[0]++;
120         }
121
122         if (!found)
123                 last = first_free;
124         if (block_group->key.objectid +
125             block_group->key.offset > last) {
126                 hole_size = block_group->key.objectid +
127                         block_group->key.offset - last;
128                 set_extent_dirty(free_space_cache, last,
129                                  last + hole_size - 1, GFP_NOFS);
130         }
131         block_group->cached = 1;
132 err:
133         btrfs_free_path(path);
134         return 0;
135 }
136
137 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
138                                                          btrfs_fs_info *info,
139                                                          u64 bytenr)
140 {
141         struct extent_map_tree *block_group_cache;
142         struct btrfs_block_group_cache *block_group = NULL;
143         u64 ptr;
144         u64 start;
145         u64 end;
146         int ret;
147
148         block_group_cache = &info->block_group_cache;
149         ret = find_first_extent_bit(block_group_cache,
150                                     bytenr, &start, &end,
151                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA);
152         if (ret) {
153                 return NULL;
154         }
155         ret = get_state_private(block_group_cache, start, &ptr);
156         if (ret)
157                 return NULL;
158
159         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
160         if (block_group->key.objectid <= bytenr && bytenr <
161             block_group->key.objectid + block_group->key.offset)
162                 return block_group;
163         return NULL;
164 }
165 static u64 find_search_start(struct btrfs_root *root,
166                              struct btrfs_block_group_cache **cache_ret,
167                              u64 search_start, int num,
168                              int data, int full_scan)
169 {
170         int ret;
171         struct btrfs_block_group_cache *cache = *cache_ret;
172         u64 last;
173         u64 start = 0;
174         u64 end = 0;
175         u64 cache_miss = 0;
176         int wrapped = 0;
177
178         if (!cache) {
179                 goto out;
180         }
181 again:
182         ret = cache_block_group(root, cache);
183         if (ret)
184                 goto out;
185
186         last = max(search_start, cache->key.objectid);
187
188         while(1) {
189                 ret = find_first_extent_bit(&root->fs_info->free_space_cache,
190                                             last, &start, &end, EXTENT_DIRTY);
191                 if (ret) {
192                         if (!cache_miss)
193                                 cache_miss = last;
194                         goto new_group;
195                 }
196
197                 start = max(last, start);
198                 last = end + 1;
199                 if (last - start < num) {
200                         if (last == cache->key.objectid + cache->key.offset)
201                                 cache_miss = start;
202                         continue;
203                 }
204                 if (data != BTRFS_BLOCK_GROUP_MIXED &&
205                     start + num > cache->key.objectid + cache->key.offset)
206                         goto new_group;
207                 return start;
208         }
209 out:
210         cache = btrfs_lookup_block_group(root->fs_info, search_start);
211         if (!cache) {
212                 printk("Unable to find block group for %Lu\n",
213                        search_start);
214                 WARN_ON(1);
215                 return search_start;
216         }
217         return search_start;
218
219 new_group:
220         last = cache->key.objectid + cache->key.offset;
221 wrapped:
222         cache = btrfs_lookup_block_group(root->fs_info, last);
223         if (!cache) {
224 no_cache:
225                 if (!wrapped) {
226                         wrapped = 1;
227                         last = search_start;
228                         data = BTRFS_BLOCK_GROUP_MIXED;
229                         goto wrapped;
230                 }
231                 goto out;
232         }
233         if (cache_miss && !cache->cached) {
234                 cache_block_group(root, cache);
235                 last = cache_miss;
236                 cache = btrfs_lookup_block_group(root->fs_info, last);
237         }
238         cache = btrfs_find_block_group(root, cache, last, data, 0);
239         if (!cache)
240                 goto no_cache;
241         *cache_ret = cache;
242         cache_miss = 0;
243         goto again;
244 }
245
246 static u64 div_factor(u64 num, int factor)
247 {
248         if (factor == 10)
249                 return num;
250         num *= factor;
251         do_div(num, 10);
252         return num;
253 }
254
255 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
256                                                  struct btrfs_block_group_cache
257                                                  *hint, u64 search_start,
258                                                  int data, int owner)
259 {
260         struct btrfs_block_group_cache *cache;
261         struct extent_map_tree *block_group_cache;
262         struct btrfs_block_group_cache *found_group = NULL;
263         struct btrfs_fs_info *info = root->fs_info;
264         u64 used;
265         u64 last = 0;
266         u64 hint_last;
267         u64 start;
268         u64 end;
269         u64 free_check;
270         u64 ptr;
271         int bit;
272         int ret;
273         int full_search = 0;
274         int factor = 8;
275         int data_swap = 0;
276
277         block_group_cache = &info->block_group_cache;
278
279         if (!owner)
280                 factor = 8;
281
282         if (data == BTRFS_BLOCK_GROUP_MIXED) {
283                 bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
284                 factor = 10;
285         } else if (data)
286                 bit = BLOCK_GROUP_DATA;
287         else
288                 bit = BLOCK_GROUP_METADATA;
289
290         if (search_start) {
291                 struct btrfs_block_group_cache *shint;
292                 shint = btrfs_lookup_block_group(info, search_start);
293                 if (shint && (shint->data == data ||
294                               shint->data == BTRFS_BLOCK_GROUP_MIXED)) {
295                         used = btrfs_block_group_used(&shint->item);
296                         if (used + shint->pinned <
297                             div_factor(shint->key.offset, factor)) {
298                                 return shint;
299                         }
300                 }
301         }
302         if (hint && (hint->data == data ||
303                      hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
304                 used = btrfs_block_group_used(&hint->item);
305                 if (used + hint->pinned <
306                     div_factor(hint->key.offset, factor)) {
307                         return hint;
308                 }
309                 last = hint->key.objectid + hint->key.offset;
310                 hint_last = last;
311         } else {
312                 if (hint)
313                         hint_last = max(hint->key.objectid, search_start);
314                 else
315                         hint_last = search_start;
316
317                 last = hint_last;
318         }
319 again:
320         while(1) {
321                 ret = find_first_extent_bit(block_group_cache, last,
322                                             &start, &end, bit);
323                 if (ret)
324                         break;
325
326                 ret = get_state_private(block_group_cache, start, &ptr);
327                 if (ret)
328                         break;
329
330                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
331                 last = cache->key.objectid + cache->key.offset;
332                 used = btrfs_block_group_used(&cache->item);
333
334                 if (full_search)
335                         free_check = cache->key.offset;
336                 else
337                         free_check = div_factor(cache->key.offset, factor);
338                 if (used + cache->pinned < free_check) {
339                         found_group = cache;
340                         goto found;
341                 }
342                 cond_resched();
343         }
344         if (!full_search) {
345                 last = search_start;
346                 full_search = 1;
347                 goto again;
348         }
349         if (!data_swap) {
350                 data_swap = 1;
351                 bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
352                 last = search_start;
353                 goto again;
354         }
355 found:
356         return found_group;
357 }
358
359 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
360                            u64 owner, u64 owner_offset)
361 {
362         u32 high_crc = ~(u32)0;
363         u32 low_crc = ~(u32)0;
364         __le64 lenum;
365
366         lenum = cpu_to_le64(root_objectid);
367         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
368         lenum = cpu_to_le64(ref_generation);
369         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
370
371 #if 0
372         lenum = cpu_to_le64(owner);
373         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
374         lenum = cpu_to_le64(owner_offset);
375         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
376 #endif
377         return ((u64)high_crc << 32) | (u64)low_crc;
378 }
379
380 static int match_extent_ref(struct extent_buffer *leaf,
381                             struct btrfs_extent_ref *disk_ref,
382                             struct btrfs_extent_ref *cpu_ref)
383 {
384         int ret;
385         int len;
386
387         if (cpu_ref->objectid)
388                 len = sizeof(*cpu_ref);
389         else
390                 len = 2 * sizeof(u64);
391         ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
392                                    len);
393         return ret == 0;
394 }
395
396 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
397                                  struct btrfs_root *root,
398                                  struct btrfs_path *path, u64 bytenr,
399                                  u64 root_objectid, u64 ref_generation,
400                                  u64 owner, u64 owner_offset, int del)
401 {
402         u64 hash;
403         struct btrfs_key key;
404         struct btrfs_key found_key;
405         struct btrfs_extent_ref ref;
406         struct extent_buffer *leaf;
407         struct btrfs_extent_ref *disk_ref;
408         int ret;
409         int ret2;
410
411         btrfs_set_stack_ref_root(&ref, root_objectid);
412         btrfs_set_stack_ref_generation(&ref, ref_generation);
413         btrfs_set_stack_ref_objectid(&ref, owner);
414         btrfs_set_stack_ref_offset(&ref, owner_offset);
415
416         hash = hash_extent_ref(root_objectid, ref_generation, owner,
417                                owner_offset);
418         key.offset = hash;
419         key.objectid = bytenr;
420         key.type = BTRFS_EXTENT_REF_KEY;
421
422         while (1) {
423                 ret = btrfs_search_slot(trans, root, &key, path,
424                                         del ? -1 : 0, del);
425                 if (ret < 0)
426                         goto out;
427                 leaf = path->nodes[0];
428                 if (ret != 0) {
429                         u32 nritems = btrfs_header_nritems(leaf);
430                         if (path->slots[0] >= nritems) {
431                                 ret2 = btrfs_next_leaf(root, path);
432                                 if (ret2)
433                                         goto out;
434                                 leaf = path->nodes[0];
435                         }
436                         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
437                         if (found_key.objectid != bytenr ||
438                             found_key.type != BTRFS_EXTENT_REF_KEY)
439                                 goto out;
440                         key.offset = found_key.offset;
441                         if (del) {
442                                 btrfs_release_path(root, path);
443                                 continue;
444                         }
445                 }
446                 disk_ref = btrfs_item_ptr(path->nodes[0],
447                                           path->slots[0],
448                                           struct btrfs_extent_ref);
449                 if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
450                         ret = 0;
451                         goto out;
452                 }
453                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
454                 key.offset = found_key.offset + 1;
455                 btrfs_release_path(root, path);
456         }
457 out:
458         return ret;
459 }
460
461 /*
462  * Back reference rules.  Back refs have three main goals:
463  *
464  * 1) differentiate between all holders of references to an extent so that
465  *    when a reference is dropped we can make sure it was a valid reference
466  *    before freeing the extent.
467  *
468  * 2) Provide enough information to quickly find the holders of an extent
469  *    if we notice a given block is corrupted or bad.
470  *
471  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
472  *    maintenance.  This is actually the same as #2, but with a slightly
473  *    different use case.
474  *
475  * File extents can be referenced by:
476  *
477  * - multiple snapshots, subvolumes, or different generations in one subvol
478  * - different files inside a single subvolume (in theory, not implemented yet)
479  * - different offsets inside a file (bookend extents in file.c)
480  *
481  * The extent ref structure has fields for:
482  *
483  * - Objectid of the subvolume root
484  * - Generation number of the tree holding the reference
485  * - objectid of the file holding the reference
486  * - offset in the file corresponding to the key holding the reference
487  *
488  * When a file extent is allocated the fields are filled in:
489  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
490  *
491  * When a leaf is cow'd new references are added for every file extent found
492  * in the leaf.  It looks the same as the create case, but trans->transid
493  * will be different when the block is cow'd.
494  *
495  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
496  *
497  * When a file extent is removed either during snapshot deletion or file
498  * truncation, the corresponding back reference is found
499  * by searching for:
500  *
501  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
502  *      inode objectid, offset in file)
503  *
504  * Btree extents can be referenced by:
505  *
506  * - Different subvolumes
507  * - Different generations of the same subvolume
508  *
509  * Storing sufficient information for a full reverse mapping of a btree
510  * block would require storing the lowest key of the block in the backref,
511  * and it would require updating that lowest key either before write out or
512  * every time it changed.  Instead, the objectid of the lowest key is stored
513  * along with the level of the tree block.  This provides a hint
514  * about where in the btree the block can be found.  Searches through the
515  * btree only need to look for a pointer to that block, so they stop one
516  * level higher than the level recorded in the backref.
517  *
518  * Some btrees do not do reference counting on their extents.  These
519  * include the extent tree and the tree of tree roots.  Backrefs for these
520  * trees always have a generation of zero.
521  *
522  * When a tree block is created, back references are inserted:
523  *
524  * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
525  *
526  * When a tree block is cow'd in a reference counted root,
527  * new back references are added for all the blocks it points to.
528  * These are of the form (trans->transid will have increased since creation):
529  *
530  * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
531  *
532  * Because the lowest_key_objectid and the level are just hints
533  * they are not used when backrefs are deleted.  When a backref is deleted:
534  *
535  * if backref was for a tree root:
536  *     root_objectid = root->root_key.objectid
537  * else
538  *     root_objectid = btrfs_header_owner(parent)
539  *
540  * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
541  *
542  * Back Reference Key hashing:
543  *
544  * Back references have four fields, each 64 bits long.  Unfortunately,
545  * This is hashed into a single 64 bit number and placed into the key offset.
546  * The key objectid corresponds to the first byte in the extent, and the
547  * key type is set to BTRFS_EXTENT_REF_KEY
548  */
549 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
550                                  struct btrfs_root *root,
551                                  struct btrfs_path *path, u64 bytenr,
552                                  u64 root_objectid, u64 ref_generation,
553                                  u64 owner, u64 owner_offset)
554 {
555         u64 hash;
556         struct btrfs_key key;
557         struct btrfs_extent_ref ref;
558         struct btrfs_extent_ref *disk_ref;
559         int ret;
560
561         btrfs_set_stack_ref_root(&ref, root_objectid);
562         btrfs_set_stack_ref_generation(&ref, ref_generation);
563         btrfs_set_stack_ref_objectid(&ref, owner);
564         btrfs_set_stack_ref_offset(&ref, owner_offset);
565
566         hash = hash_extent_ref(root_objectid, ref_generation, owner,
567                                owner_offset);
568         key.offset = hash;
569         key.objectid = bytenr;
570         key.type = BTRFS_EXTENT_REF_KEY;
571
572         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
573         while (ret == -EEXIST) {
574                 disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
575                                           struct btrfs_extent_ref);
576                 if (match_extent_ref(path->nodes[0], disk_ref, &ref))
577                         goto out;
578                 key.offset++;
579                 btrfs_release_path(root, path);
580                 ret = btrfs_insert_empty_item(trans, root, path, &key,
581                                               sizeof(ref));
582         }
583         if (ret)
584                 goto out;
585         disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
586                                   struct btrfs_extent_ref);
587         write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
588                             sizeof(ref));
589         btrfs_mark_buffer_dirty(path->nodes[0]);
590 out:
591         btrfs_release_path(root, path);
592         return ret;
593 }
594
595 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
596                                 struct btrfs_root *root,
597                                 u64 bytenr, u64 num_bytes,
598                                 u64 root_objectid, u64 ref_generation,
599                                 u64 owner, u64 owner_offset)
600 {
601         struct btrfs_path *path;
602         int ret;
603         struct btrfs_key key;
604         struct extent_buffer *l;
605         struct btrfs_extent_item *item;
606         u32 refs;
607
608         WARN_ON(num_bytes < root->sectorsize);
609         path = btrfs_alloc_path();
610         if (!path)
611                 return -ENOMEM;
612
613         key.objectid = bytenr;
614         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
615         key.offset = num_bytes;
616         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
617                                 0, 1);
618         if (ret < 0)
619                 return ret;
620         if (ret != 0) {
621                 BUG();
622         }
623         BUG_ON(ret != 0);
624         l = path->nodes[0];
625         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
626         refs = btrfs_extent_refs(l, item);
627         btrfs_set_extent_refs(l, item, refs + 1);
628         btrfs_mark_buffer_dirty(path->nodes[0]);
629
630         btrfs_release_path(root->fs_info->extent_root, path);
631
632         ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
633                                           path, bytenr, root_objectid,
634                                           ref_generation, owner, owner_offset);
635         BUG_ON(ret);
636         finish_current_insert(trans, root->fs_info->extent_root);
637         del_pending_extents(trans, root->fs_info->extent_root);
638
639         btrfs_free_path(path);
640         return 0;
641 }
642
643 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
644                          struct btrfs_root *root)
645 {
646         finish_current_insert(trans, root->fs_info->extent_root);
647         del_pending_extents(trans, root->fs_info->extent_root);
648         return 0;
649 }
650
651 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
652                              struct btrfs_root *root, u64 bytenr,
653                              u64 num_bytes, u32 *refs)
654 {
655         struct btrfs_path *path;
656         int ret;
657         struct btrfs_key key;
658         struct extent_buffer *l;
659         struct btrfs_extent_item *item;
660
661         WARN_ON(num_bytes < root->sectorsize);
662         path = btrfs_alloc_path();
663         key.objectid = bytenr;
664         key.offset = num_bytes;
665         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
666         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
667                                 0, 0);
668         if (ret < 0)
669                 goto out;
670         if (ret != 0) {
671                 btrfs_print_leaf(root, path->nodes[0]);
672                 printk("failed to find block number %Lu\n", bytenr);
673                 BUG();
674         }
675         l = path->nodes[0];
676         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
677         *refs = btrfs_extent_refs(l, item);
678 out:
679         btrfs_free_path(path);
680         return 0;
681 }
682
683 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
684                                   struct btrfs_path *count_path,
685                                   u64 first_extent)
686 {
687         struct btrfs_root *extent_root = root->fs_info->extent_root;
688         struct btrfs_path *path;
689         u64 bytenr;
690         u64 found_objectid;
691         u64 root_objectid = 0;
692         u32 total_count = 0;
693         u32 cur_count;
694         u32 refs;
695         u32 nritems;
696         int ret;
697         struct btrfs_key key;
698         struct btrfs_key found_key;
699         struct extent_buffer *l;
700         struct btrfs_extent_item *item;
701         struct btrfs_extent_ref *ref_item;
702         int level = -1;
703
704         path = btrfs_alloc_path();
705 again:
706         if (level == -1)
707                 bytenr = first_extent;
708         else
709                 bytenr = count_path->nodes[level]->start;
710
711         cur_count = 0;
712         key.objectid = bytenr;
713         key.offset = 0;
714
715         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
716         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
717         if (ret < 0)
718                 goto out;
719         BUG_ON(ret == 0);
720
721         l = path->nodes[0];
722         btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
723
724         if (found_key.objectid != bytenr ||
725             found_key.type != BTRFS_EXTENT_ITEM_KEY) {
726                 goto out;
727         }
728
729         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
730         refs = btrfs_extent_refs(l, item);
731         while (1) {
732                 nritems = btrfs_header_nritems(l);
733                 if (path->slots[0] >= nritems) {
734                         ret = btrfs_next_leaf(extent_root, path);
735                         if (ret == 0)
736                                 continue;
737                         break;
738                 }
739                 btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
740                 if (found_key.objectid != bytenr)
741                         break;
742                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
743                         path->slots[0]++;
744                         continue;
745                 }
746
747                 cur_count++;
748                 ref_item = btrfs_item_ptr(l, path->slots[0],
749                                           struct btrfs_extent_ref);
750                 found_objectid = btrfs_ref_root(l, ref_item);
751
752                 if (found_objectid != root_objectid)
753                         total_count++;
754
755                 if (total_count > 1)
756                         goto out;
757
758                 if (root_objectid == 0)
759                         root_objectid = found_objectid;
760
761                 path->slots[0]++;
762         }
763         if (cur_count == 0) {
764                 total_count = 0;
765                 goto out;
766         }
767         if (total_count > 1)
768                 goto out;
769         if (level >= 0 && root->node == count_path->nodes[level])
770                 goto out;
771         level++;
772         btrfs_release_path(root, path);
773         goto again;
774
775 out:
776         btrfs_free_path(path);
777         return total_count;
778
779 }
780
781 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
782                        struct btrfs_root *root, u64 owner_objectid)
783 {
784         u64 generation;
785         u64 key_objectid;
786         u64 level;
787         u32 nritems;
788         struct btrfs_disk_key disk_key;
789
790         level = btrfs_header_level(root->node);
791         generation = trans->transid;
792         nritems = btrfs_header_nritems(root->node);
793         if (nritems > 0) {
794                 if (level == 0)
795                         btrfs_item_key(root->node, &disk_key, 0);
796                 else
797                         btrfs_node_key(root->node, &disk_key, 0);
798                 key_objectid = btrfs_disk_key_objectid(&disk_key);
799         } else {
800                 key_objectid = 0;
801         }
802         return btrfs_inc_extent_ref(trans, root, root->node->start,
803                                     root->node->len, owner_objectid,
804                                     generation, level, key_objectid);
805 }
806
807 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
808                   struct extent_buffer *buf)
809 {
810         u64 bytenr;
811         u32 nritems;
812         struct btrfs_key key;
813         struct btrfs_file_extent_item *fi;
814         int i;
815         int level;
816         int ret;
817         int faili;
818
819         if (!root->ref_cows)
820                 return 0;
821
822         level = btrfs_header_level(buf);
823         nritems = btrfs_header_nritems(buf);
824         for (i = 0; i < nritems; i++) {
825                 if (level == 0) {
826                         u64 disk_bytenr;
827                         btrfs_item_key_to_cpu(buf, &key, i);
828                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
829                                 continue;
830                         fi = btrfs_item_ptr(buf, i,
831                                             struct btrfs_file_extent_item);
832                         if (btrfs_file_extent_type(buf, fi) ==
833                             BTRFS_FILE_EXTENT_INLINE)
834                                 continue;
835                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
836                         if (disk_bytenr == 0)
837                                 continue;
838                         ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
839                                     btrfs_file_extent_disk_num_bytes(buf, fi),
840                                     root->root_key.objectid, trans->transid,
841                                     key.objectid, key.offset);
842                         if (ret) {
843                                 faili = i;
844                                 goto fail;
845                         }
846                 } else {
847                         bytenr = btrfs_node_blockptr(buf, i);
848                         btrfs_node_key_to_cpu(buf, &key, i);
849                         ret = btrfs_inc_extent_ref(trans, root, bytenr,
850                                            btrfs_level_size(root, level - 1),
851                                            root->root_key.objectid,
852                                            trans->transid,
853                                            level - 1, key.objectid);
854                         if (ret) {
855                                 faili = i;
856                                 goto fail;
857                         }
858                 }
859         }
860         return 0;
861 fail:
862         WARN_ON(1);
863 #if 0
864         for (i =0; i < faili; i++) {
865                 if (level == 0) {
866                         u64 disk_bytenr;
867                         btrfs_item_key_to_cpu(buf, &key, i);
868                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
869                                 continue;
870                         fi = btrfs_item_ptr(buf, i,
871                                             struct btrfs_file_extent_item);
872                         if (btrfs_file_extent_type(buf, fi) ==
873                             BTRFS_FILE_EXTENT_INLINE)
874                                 continue;
875                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
876                         if (disk_bytenr == 0)
877                                 continue;
878                         err = btrfs_free_extent(trans, root, disk_bytenr,
879                                     btrfs_file_extent_disk_num_bytes(buf,
880                                                                       fi), 0);
881                         BUG_ON(err);
882                 } else {
883                         bytenr = btrfs_node_blockptr(buf, i);
884                         err = btrfs_free_extent(trans, root, bytenr,
885                                         btrfs_level_size(root, level - 1), 0);
886                         BUG_ON(err);
887                 }
888         }
889 #endif
890         return ret;
891 }
892
893 static int write_one_cache_group(struct btrfs_trans_handle *trans,
894                                  struct btrfs_root *root,
895                                  struct btrfs_path *path,
896                                  struct btrfs_block_group_cache *cache)
897 {
898         int ret;
899         int pending_ret;
900         struct btrfs_root *extent_root = root->fs_info->extent_root;
901         unsigned long bi;
902         struct extent_buffer *leaf;
903
904         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
905         if (ret < 0)
906                 goto fail;
907         BUG_ON(ret);
908
909         leaf = path->nodes[0];
910         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
911         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
912         btrfs_mark_buffer_dirty(leaf);
913         btrfs_release_path(extent_root, path);
914 fail:
915         finish_current_insert(trans, extent_root);
916         pending_ret = del_pending_extents(trans, extent_root);
917         if (ret)
918                 return ret;
919         if (pending_ret)
920                 return pending_ret;
921         return 0;
922
923 }
924
925 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
926                                    struct btrfs_root *root)
927 {
928         struct extent_map_tree *block_group_cache;
929         struct btrfs_block_group_cache *cache;
930         int ret;
931         int err = 0;
932         int werr = 0;
933         struct btrfs_path *path;
934         u64 last = 0;
935         u64 start;
936         u64 end;
937         u64 ptr;
938
939         block_group_cache = &root->fs_info->block_group_cache;
940         path = btrfs_alloc_path();
941         if (!path)
942                 return -ENOMEM;
943
944         while(1) {
945                 ret = find_first_extent_bit(block_group_cache, last,
946                                             &start, &end, BLOCK_GROUP_DIRTY);
947                 if (ret)
948                         break;
949
950                 last = end + 1;
951                 ret = get_state_private(block_group_cache, start, &ptr);
952                 if (ret)
953                         break;
954
955                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
956                 err = write_one_cache_group(trans, root,
957                                             path, cache);
958                 /*
959                  * if we fail to write the cache group, we want
960                  * to keep it marked dirty in hopes that a later
961                  * write will work
962                  */
963                 if (err) {
964                         werr = err;
965                         continue;
966                 }
967                 clear_extent_bits(block_group_cache, start, end,
968                                   BLOCK_GROUP_DIRTY, GFP_NOFS);
969         }
970         btrfs_free_path(path);
971         return werr;
972 }
973
974 static int update_block_group(struct btrfs_trans_handle *trans,
975                               struct btrfs_root *root,
976                               u64 bytenr, u64 num_bytes, int alloc,
977                               int mark_free, int data)
978 {
979         struct btrfs_block_group_cache *cache;
980         struct btrfs_fs_info *info = root->fs_info;
981         u64 total = num_bytes;
982         u64 old_val;
983         u64 byte_in_group;
984         u64 start;
985         u64 end;
986
987         while(total) {
988                 cache = btrfs_lookup_block_group(info, bytenr);
989                 if (!cache) {
990                         return -1;
991                 }
992                 byte_in_group = bytenr - cache->key.objectid;
993                 WARN_ON(byte_in_group > cache->key.offset);
994                 start = cache->key.objectid;
995                 end = start + cache->key.offset - 1;
996                 set_extent_bits(&info->block_group_cache, start, end,
997                                 BLOCK_GROUP_DIRTY, GFP_NOFS);
998
999                 old_val = btrfs_block_group_used(&cache->item);
1000                 num_bytes = min(total, cache->key.offset - byte_in_group);
1001                 if (alloc) {
1002                         if (cache->data != data &&
1003                             old_val < (cache->key.offset >> 1)) {
1004                                 int bit_to_clear;
1005                                 int bit_to_set;
1006                                 cache->data = data;
1007                                 if (data) {
1008                                         bit_to_clear = BLOCK_GROUP_METADATA;
1009                                         bit_to_set = BLOCK_GROUP_DATA;
1010                                         cache->item.flags &=
1011                                                 ~BTRFS_BLOCK_GROUP_MIXED;
1012                                         cache->item.flags |=
1013                                                 BTRFS_BLOCK_GROUP_DATA;
1014                                 } else {
1015                                         bit_to_clear = BLOCK_GROUP_DATA;
1016                                         bit_to_set = BLOCK_GROUP_METADATA;
1017                                         cache->item.flags &=
1018                                                 ~BTRFS_BLOCK_GROUP_MIXED;
1019                                         cache->item.flags &=
1020                                                 ~BTRFS_BLOCK_GROUP_DATA;
1021                                 }
1022                                 clear_extent_bits(&info->block_group_cache,
1023                                                   start, end, bit_to_clear,
1024                                                   GFP_NOFS);
1025                                 set_extent_bits(&info->block_group_cache,
1026                                                 start, end, bit_to_set,
1027                                                 GFP_NOFS);
1028                         } else if (cache->data != data &&
1029                                    cache->data != BTRFS_BLOCK_GROUP_MIXED) {
1030                                 cache->data = BTRFS_BLOCK_GROUP_MIXED;
1031                                 set_extent_bits(&info->block_group_cache,
1032                                                 start, end,
1033                                                 BLOCK_GROUP_DATA |
1034                                                 BLOCK_GROUP_METADATA,
1035                                                 GFP_NOFS);
1036                         }
1037                         old_val += num_bytes;
1038                 } else {
1039                         old_val -= num_bytes;
1040                         if (mark_free) {
1041                                 set_extent_dirty(&info->free_space_cache,
1042                                                  bytenr, bytenr + num_bytes - 1,
1043                                                  GFP_NOFS);
1044                         }
1045                 }
1046                 btrfs_set_block_group_used(&cache->item, old_val);
1047                 total -= num_bytes;
1048                 bytenr += num_bytes;
1049         }
1050         return 0;
1051 }
1052 static int update_pinned_extents(struct btrfs_root *root,
1053                                 u64 bytenr, u64 num, int pin)
1054 {
1055         u64 len;
1056         struct btrfs_block_group_cache *cache;
1057         struct btrfs_fs_info *fs_info = root->fs_info;
1058
1059         if (pin) {
1060                 set_extent_dirty(&fs_info->pinned_extents,
1061                                 bytenr, bytenr + num - 1, GFP_NOFS);
1062         } else {
1063                 clear_extent_dirty(&fs_info->pinned_extents,
1064                                 bytenr, bytenr + num - 1, GFP_NOFS);
1065         }
1066         while (num > 0) {
1067                 cache = btrfs_lookup_block_group(fs_info, bytenr);
1068                 WARN_ON(!cache);
1069                 len = min(num, cache->key.offset -
1070                           (bytenr - cache->key.objectid));
1071                 if (pin) {
1072                         cache->pinned += len;
1073                         fs_info->total_pinned += len;
1074                 } else {
1075                         cache->pinned -= len;
1076                         fs_info->total_pinned -= len;
1077                 }
1078                 bytenr += len;
1079                 num -= len;
1080         }
1081         return 0;
1082 }
1083
1084 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
1085 {
1086         u64 last = 0;
1087         u64 start;
1088         u64 end;
1089         struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents;
1090         int ret;
1091
1092         while(1) {
1093                 ret = find_first_extent_bit(pinned_extents, last,
1094                                             &start, &end, EXTENT_DIRTY);
1095                 if (ret)
1096                         break;
1097                 set_extent_dirty(copy, start, end, GFP_NOFS);
1098                 last = end + 1;
1099         }
1100         return 0;
1101 }
1102
1103 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1104                                struct btrfs_root *root,
1105                                struct extent_map_tree *unpin)
1106 {
1107         u64 start;
1108         u64 end;
1109         int ret;
1110         struct extent_map_tree *free_space_cache;
1111         free_space_cache = &root->fs_info->free_space_cache;
1112
1113         while(1) {
1114                 ret = find_first_extent_bit(unpin, 0, &start, &end,
1115                                             EXTENT_DIRTY);
1116                 if (ret)
1117                         break;
1118                 update_pinned_extents(root, start, end + 1 - start, 0);
1119                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1120                 set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
1121         }
1122         return 0;
1123 }
1124
1125 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
1126                                  btrfs_root *extent_root)
1127 {
1128         u64 start;
1129         u64 end;
1130         struct btrfs_fs_info *info = extent_root->fs_info;
1131         struct extent_buffer *eb;
1132         struct btrfs_path *path;
1133         struct btrfs_key ins;
1134         struct btrfs_disk_key first;
1135         struct btrfs_extent_item extent_item;
1136         int ret;
1137         int level;
1138         int err = 0;
1139
1140         btrfs_set_stack_extent_refs(&extent_item, 1);
1141         btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
1142         path = btrfs_alloc_path();
1143
1144         while(1) {
1145                 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1146                                             &end, EXTENT_LOCKED);
1147                 if (ret)
1148                         break;
1149
1150                 ins.objectid = start;
1151                 ins.offset = end + 1 - start;
1152                 err = btrfs_insert_item(trans, extent_root, &ins,
1153                                         &extent_item, sizeof(extent_item));
1154                 clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
1155                                   GFP_NOFS);
1156                 eb = read_tree_block(extent_root, ins.objectid, ins.offset);
1157                 level = btrfs_header_level(eb);
1158                 if (level == 0) {
1159                         btrfs_item_key(eb, &first, 0);
1160                 } else {
1161                         btrfs_node_key(eb, &first, 0);
1162                 }
1163                 err = btrfs_insert_extent_backref(trans, extent_root, path,
1164                                           start, extent_root->root_key.objectid,
1165                                           0, level,
1166                                           btrfs_disk_key_objectid(&first));
1167                 BUG_ON(err);
1168                 free_extent_buffer(eb);
1169         }
1170         btrfs_free_path(path);
1171         return 0;
1172 }
1173
1174 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
1175                           int pending)
1176 {
1177         int err = 0;
1178         struct extent_buffer *buf;
1179
1180         if (!pending) {
1181                 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1182                 if (buf) {
1183                         if (btrfs_buffer_uptodate(buf)) {
1184                                 u64 transid =
1185                                     root->fs_info->running_transaction->transid;
1186                                 if (btrfs_header_generation(buf) == transid) {
1187                                         free_extent_buffer(buf);
1188                                         return 1;
1189                                 }
1190                         }
1191                         free_extent_buffer(buf);
1192                 }
1193                 update_pinned_extents(root, bytenr, num_bytes, 1);
1194         } else {
1195                 set_extent_bits(&root->fs_info->pending_del,
1196                                 bytenr, bytenr + num_bytes - 1,
1197                                 EXTENT_LOCKED, GFP_NOFS);
1198         }
1199         BUG_ON(err < 0);
1200         return 0;
1201 }
1202
1203 /*
1204  * remove an extent from the root, returns 0 on success
1205  */
1206 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1207                          *root, u64 bytenr, u64 num_bytes,
1208                          u64 root_objectid, u64 ref_generation,
1209                          u64 owner_objectid, u64 owner_offset, int pin,
1210                          int mark_free)
1211 {
1212         struct btrfs_path *path;
1213         struct btrfs_key key;
1214         struct btrfs_fs_info *info = root->fs_info;
1215         struct btrfs_root *extent_root = info->extent_root;
1216         struct extent_buffer *leaf;
1217         int ret;
1218         struct btrfs_extent_item *ei;
1219         u32 refs;
1220
1221         key.objectid = bytenr;
1222         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1223         key.offset = num_bytes;
1224
1225         path = btrfs_alloc_path();
1226         if (!path)
1227                 return -ENOMEM;
1228
1229         ret = lookup_extent_backref(trans, extent_root, path,
1230                                     bytenr, root_objectid,
1231                                     ref_generation,
1232                                     owner_objectid, owner_offset, 1);
1233         if (ret == 0) {
1234                 ret = btrfs_del_item(trans, extent_root, path);
1235         } else {
1236                 btrfs_print_leaf(extent_root, path->nodes[0]);
1237                 WARN_ON(1);
1238                 printk("Unable to find ref byte nr %Lu root %Lu "
1239                        " gen %Lu owner %Lu offset %Lu\n", bytenr,
1240                        root_objectid, ref_generation, owner_objectid,
1241                        owner_offset);
1242         }
1243         btrfs_release_path(extent_root, path);
1244         ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1245         if (ret < 0)
1246                 return ret;
1247         BUG_ON(ret);
1248
1249         leaf = path->nodes[0];
1250         ei = btrfs_item_ptr(leaf, path->slots[0],
1251                             struct btrfs_extent_item);
1252         refs = btrfs_extent_refs(leaf, ei);
1253         BUG_ON(refs == 0);
1254         refs -= 1;
1255         btrfs_set_extent_refs(leaf, ei, refs);
1256         btrfs_mark_buffer_dirty(leaf);
1257
1258         if (refs == 0) {
1259                 u64 super_used;
1260                 u64 root_used;
1261
1262                 if (pin) {
1263                         ret = pin_down_bytes(root, bytenr, num_bytes, 0);
1264                         if (ret > 0)
1265                                 mark_free = 1;
1266                         BUG_ON(ret < 0);
1267                 }
1268
1269                 /* block accounting for super block */
1270                 super_used = btrfs_super_bytes_used(&info->super_copy);
1271                 btrfs_set_super_bytes_used(&info->super_copy,
1272                                            super_used - num_bytes);
1273
1274                 /* block accounting for root item */
1275                 root_used = btrfs_root_used(&root->root_item);
1276                 btrfs_set_root_used(&root->root_item,
1277                                            root_used - num_bytes);
1278
1279                 ret = btrfs_del_item(trans, extent_root, path);
1280                 if (ret) {
1281                         return ret;
1282                 }
1283                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1284                                          mark_free, 0);
1285                 BUG_ON(ret);
1286         }
1287         btrfs_free_path(path);
1288         finish_current_insert(trans, extent_root);
1289         return ret;
1290 }
1291
1292 /*
1293  * find all the blocks marked as pending in the radix tree and remove
1294  * them from the extent map
1295  */
1296 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
1297                                btrfs_root *extent_root)
1298 {
1299         int ret;
1300         int err = 0;
1301         u64 start;
1302         u64 end;
1303         struct extent_map_tree *pending_del;
1304         struct extent_map_tree *pinned_extents;
1305
1306         pending_del = &extent_root->fs_info->pending_del;
1307         pinned_extents = &extent_root->fs_info->pinned_extents;
1308
1309         while(1) {
1310                 ret = find_first_extent_bit(pending_del, 0, &start, &end,
1311                                             EXTENT_LOCKED);
1312                 if (ret)
1313                         break;
1314                 update_pinned_extents(extent_root, start, end + 1 - start, 1);
1315                 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
1316                                   GFP_NOFS);
1317                 ret = __free_extent(trans, extent_root,
1318                                      start, end + 1 - start,
1319                                      extent_root->root_key.objectid,
1320                                      0, 0, 0, 0, 0);
1321                 if (ret)
1322                         err = ret;
1323         }
1324         return err;
1325 }
1326
1327 /*
1328  * remove an extent from the root, returns 0 on success
1329  */
1330 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1331                       *root, u64 bytenr, u64 num_bytes,
1332                       u64 root_objectid, u64 ref_generation,
1333                       u64 owner_objectid, u64 owner_offset, int pin)
1334 {
1335         struct btrfs_root *extent_root = root->fs_info->extent_root;
1336         int pending_ret;
1337         int ret;
1338
1339         WARN_ON(num_bytes < root->sectorsize);
1340         if (!root->ref_cows)
1341                 ref_generation = 0;
1342
1343         if (root == extent_root) {
1344                 pin_down_bytes(root, bytenr, num_bytes, 1);
1345                 return 0;
1346         }
1347         ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
1348                             ref_generation, owner_objectid, owner_offset,
1349                             pin, pin == 0);
1350         pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
1351         return ret ? ret : pending_ret;
1352 }
1353
1354 static u64 stripe_align(struct btrfs_root *root, u64 val)
1355 {
1356         u64 mask = ((u64)root->stripesize - 1);
1357         u64 ret = (val + mask) & ~mask;
1358         return ret;
1359 }
1360
1361 /*
1362  * walks the btree of allocated extents and find a hole of a given size.
1363  * The key ins is changed to record the hole:
1364  * ins->objectid == block start
1365  * ins->flags = BTRFS_EXTENT_ITEM_KEY
1366  * ins->offset == number of blocks
1367  * Any available blocks before search_start are skipped.
1368  */
1369 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1370                             *orig_root, u64 num_bytes, u64 empty_size,
1371                             u64 search_start, u64 search_end, u64 hint_byte,
1372                             struct btrfs_key *ins, u64 exclude_start,
1373                             u64 exclude_nr, int data)
1374 {
1375         struct btrfs_path *path;
1376         struct btrfs_key key;
1377         u64 hole_size = 0;
1378         u64 aligned;
1379         int ret;
1380         int slot = 0;
1381         u64 last_byte = 0;
1382         u64 orig_search_start = search_start;
1383         int start_found;
1384         struct extent_buffer *l;
1385         struct btrfs_root * root = orig_root->fs_info->extent_root;
1386         struct btrfs_fs_info *info = root->fs_info;
1387         u64 total_needed = num_bytes;
1388         int level;
1389         struct btrfs_block_group_cache *block_group;
1390         int full_scan = 0;
1391         int wrapped = 0;
1392         u64 cached_start;
1393
1394         WARN_ON(num_bytes < root->sectorsize);
1395         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
1396
1397         level = btrfs_header_level(root->node);
1398
1399         if (num_bytes >= 32 * 1024 * 1024 && hint_byte) {
1400                 data = BTRFS_BLOCK_GROUP_MIXED;
1401         }
1402
1403         if (search_end == (u64)-1)
1404                 search_end = btrfs_super_total_bytes(&info->super_copy);
1405         if (hint_byte) {
1406                 block_group = btrfs_lookup_block_group(info, hint_byte);
1407                 if (!block_group)
1408                         hint_byte = search_start;
1409                 block_group = btrfs_find_block_group(root, block_group,
1410                                                      hint_byte, data, 1);
1411         } else {
1412                 block_group = btrfs_find_block_group(root,
1413                                                      trans->block_group,
1414                                                      search_start, data, 1);
1415         }
1416
1417         total_needed += empty_size;
1418         path = btrfs_alloc_path();
1419 check_failed:
1420         if (!block_group) {
1421                 block_group = btrfs_lookup_block_group(info, search_start);
1422                 if (!block_group)
1423                         block_group = btrfs_lookup_block_group(info,
1424                                                        orig_search_start);
1425         }
1426         search_start = find_search_start(root, &block_group, search_start,
1427                                          total_needed, data, full_scan);
1428         search_start = stripe_align(root, search_start);
1429         cached_start = search_start;
1430         btrfs_init_path(path);
1431         ins->objectid = search_start;
1432         ins->offset = 0;
1433         start_found = 0;
1434         path->reada = 2;
1435
1436         ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
1437         if (ret < 0)
1438                 goto error;
1439
1440         if (path->slots[0] > 0) {
1441                 path->slots[0]--;
1442         }
1443
1444         l = path->nodes[0];
1445         btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1446
1447         /*
1448          * walk backwards to find the first extent item key
1449          */
1450         while(btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
1451                 if (path->slots[0] == 0) {
1452                         ret = btrfs_prev_leaf(root, path);
1453                         if (ret != 0) {
1454                                 ret = btrfs_search_slot(trans, root, ins,
1455                                                         path, 0, 0);
1456                                 if (ret < 0)
1457                                         goto error;
1458                                 if (path->slots[0] > 0)
1459                                         path->slots[0]--;
1460                                 break;
1461                         }
1462                 } else {
1463                         path->slots[0]--;
1464                 }
1465                 l = path->nodes[0];
1466                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1467         }
1468         while (1) {
1469                 l = path->nodes[0];
1470                 slot = path->slots[0];
1471                 if (slot >= btrfs_header_nritems(l)) {
1472                         ret = btrfs_next_leaf(root, path);
1473                         if (ret == 0)
1474                                 continue;
1475                         if (ret < 0)
1476                                 goto error;
1477
1478                         search_start = max(search_start,
1479                                            block_group->key.objectid);
1480                         if (!start_found) {
1481                                 aligned = stripe_align(root, search_start);
1482                                 ins->objectid = aligned;
1483                                 if (aligned >= search_end) {
1484                                         ret = -ENOSPC;
1485                                         goto error;
1486                                 }
1487                                 ins->offset = search_end - aligned;
1488                                 start_found = 1;
1489                                 goto check_pending;
1490                         }
1491                         ins->objectid = stripe_align(root,
1492                                                      last_byte > search_start ?
1493                                                      last_byte : search_start);
1494                         if (search_end <= ins->objectid) {
1495                                 ret = -ENOSPC;
1496                                 goto error;
1497                         }
1498                         ins->offset = search_end - ins->objectid;
1499                         BUG_ON(ins->objectid >= search_end);
1500                         goto check_pending;
1501                 }
1502                 btrfs_item_key_to_cpu(l, &key, slot);
1503
1504                 if (key.objectid >= search_start && key.objectid > last_byte &&
1505                     start_found) {
1506                         if (last_byte < search_start)
1507                                 last_byte = search_start;
1508                         aligned = stripe_align(root, last_byte);
1509                         hole_size = key.objectid - aligned;
1510                         if (key.objectid > aligned && hole_size >= num_bytes) {
1511                                 ins->objectid = aligned;
1512                                 ins->offset = hole_size;
1513                                 goto check_pending;
1514                         }
1515                 }
1516                 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
1517                         if (!start_found && btrfs_key_type(&key) ==
1518                             BTRFS_BLOCK_GROUP_ITEM_KEY) {
1519                                 last_byte = key.objectid;
1520                                 start_found = 1;
1521                         }
1522                         goto next;
1523                 }
1524
1525
1526                 start_found = 1;
1527                 last_byte = key.objectid + key.offset;
1528
1529                 if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
1530                     last_byte >= block_group->key.objectid +
1531                     block_group->key.offset) {
1532                         btrfs_release_path(root, path);
1533                         search_start = block_group->key.objectid +
1534                                 block_group->key.offset;
1535                         goto new_group;
1536                 }
1537 next:
1538                 path->slots[0]++;
1539                 cond_resched();
1540         }
1541 check_pending:
1542         /* we have to make sure we didn't find an extent that has already
1543          * been allocated by the map tree or the original allocation
1544          */
1545         btrfs_release_path(root, path);
1546         BUG_ON(ins->objectid < search_start);
1547
1548         if (ins->objectid + num_bytes >= search_end)
1549                 goto enospc;
1550         if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
1551             ins->objectid + num_bytes > block_group->
1552             key.objectid + block_group->key.offset) {
1553                 search_start = block_group->key.objectid +
1554                         block_group->key.offset;
1555                 goto new_group;
1556         }
1557         if (test_range_bit(&info->extent_ins, ins->objectid,
1558                            ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
1559                 search_start = ins->objectid + num_bytes;
1560                 goto new_group;
1561         }
1562         if (test_range_bit(&info->pinned_extents, ins->objectid,
1563                            ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
1564                 search_start = ins->objectid + num_bytes;
1565                 goto new_group;
1566         }
1567         if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
1568             ins->objectid < exclude_start + exclude_nr)) {
1569                 search_start = exclude_start + exclude_nr;
1570                 goto new_group;
1571         }
1572         if (!data) {
1573                 block_group = btrfs_lookup_block_group(info, ins->objectid);
1574                 if (block_group)
1575                         trans->block_group = block_group;
1576         }
1577         ins->offset = num_bytes;
1578         btrfs_free_path(path);
1579         return 0;
1580
1581 new_group:
1582         if (search_start + num_bytes >= search_end) {
1583 enospc:
1584                 search_start = orig_search_start;
1585                 if (full_scan) {
1586                         ret = -ENOSPC;
1587                         goto error;
1588                 }
1589                 if (wrapped) {
1590                         if (!full_scan)
1591                                 total_needed -= empty_size;
1592                         full_scan = 1;
1593                         data = BTRFS_BLOCK_GROUP_MIXED;
1594                 } else
1595                         wrapped = 1;
1596         }
1597         block_group = btrfs_lookup_block_group(info, search_start);
1598         cond_resched();
1599         block_group = btrfs_find_block_group(root, block_group,
1600                                              search_start, data, 0);
1601         goto check_failed;
1602
1603 error:
1604         btrfs_release_path(root, path);
1605         btrfs_free_path(path);
1606         return ret;
1607 }
1608 /*
1609  * finds a free extent and does all the dirty work required for allocation
1610  * returns the key for the extent through ins, and a tree buffer for
1611  * the first block of the extent through buf.
1612  *
1613  * returns 0 if everything worked, non-zero otherwise.
1614  */
1615 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1616                        struct btrfs_root *root,
1617                        u64 num_bytes, u64 root_objectid, u64 ref_generation,
1618                        u64 owner, u64 owner_offset,
1619                        u64 empty_size, u64 hint_byte,
1620                        u64 search_end, struct btrfs_key *ins, int data)
1621 {
1622         int ret;
1623         int pending_ret;
1624         u64 super_used, root_used;
1625         u64 search_start = 0;
1626         u64 new_hint;
1627         struct btrfs_fs_info *info = root->fs_info;
1628         struct btrfs_root *extent_root = info->extent_root;
1629         struct btrfs_extent_item extent_item;
1630         struct btrfs_path *path;
1631
1632         btrfs_set_stack_extent_refs(&extent_item, 1);
1633
1634         new_hint = max(hint_byte, root->fs_info->alloc_start);
1635         if (new_hint < btrfs_super_total_bytes(&info->super_copy))
1636                 hint_byte = new_hint;
1637
1638         WARN_ON(num_bytes < root->sectorsize);
1639         ret = find_free_extent(trans, root, num_bytes, empty_size,
1640                                search_start, search_end, hint_byte, ins,
1641                                trans->alloc_exclude_start,
1642                                trans->alloc_exclude_nr, data);
1643         BUG_ON(ret);
1644         if (ret)
1645                 return ret;
1646
1647         /* block accounting for super block */
1648         super_used = btrfs_super_bytes_used(&info->super_copy);
1649         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
1650
1651         /* block accounting for root item */
1652         root_used = btrfs_root_used(&root->root_item);
1653         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
1654
1655         clear_extent_dirty(&root->fs_info->free_space_cache,
1656                            ins->objectid, ins->objectid + ins->offset - 1,
1657                            GFP_NOFS);
1658
1659         if (root == extent_root) {
1660                 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
1661                                 ins->objectid + ins->offset - 1,
1662                                 EXTENT_LOCKED, GFP_NOFS);
1663                 WARN_ON(data == 1);
1664                 goto update_block;
1665         }
1666
1667         WARN_ON(trans->alloc_exclude_nr);
1668         trans->alloc_exclude_start = ins->objectid;
1669         trans->alloc_exclude_nr = ins->offset;
1670         ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
1671                                 sizeof(extent_item));
1672
1673         trans->alloc_exclude_start = 0;
1674         trans->alloc_exclude_nr = 0;
1675         BUG_ON(ret);
1676
1677         path = btrfs_alloc_path();
1678         BUG_ON(!path);
1679         ret = btrfs_insert_extent_backref(trans, extent_root, path,
1680                                           ins->objectid, root_objectid,
1681                                           ref_generation, owner, owner_offset);
1682
1683         BUG_ON(ret);
1684         btrfs_free_path(path);
1685         finish_current_insert(trans, extent_root);
1686         pending_ret = del_pending_extents(trans, extent_root);
1687
1688         if (ret) {
1689                 return ret;
1690         }
1691         if (pending_ret) {
1692                 return pending_ret;
1693         }
1694
1695 update_block:
1696         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
1697                                  data);
1698         BUG_ON(ret);
1699         return 0;
1700 }
1701
1702 /*
1703  * helper function to allocate a block for a given tree
1704  * returns the tree buffer or NULL.
1705  */
1706 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1707                                              struct btrfs_root *root,
1708                                              u32 blocksize,
1709                                              u64 root_objectid, u64 hint,
1710                                              u64 empty_size)
1711 {
1712         u64 ref_generation;
1713
1714         if (root->ref_cows)
1715                 ref_generation = trans->transid;
1716         else
1717                 ref_generation = 0;
1718
1719
1720         return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
1721                                         ref_generation, 0, 0, hint, empty_size);
1722 }
1723
1724 /*
1725  * helper function to allocate a block for a given tree
1726  * returns the tree buffer or NULL.
1727  */
1728 struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1729                                              struct btrfs_root *root,
1730                                              u32 blocksize,
1731                                              u64 root_objectid,
1732                                              u64 ref_generation,
1733                                              u64 first_objectid,
1734                                              int level,
1735                                              u64 hint,
1736                                              u64 empty_size)
1737 {
1738         struct btrfs_key ins;
1739         int ret;
1740         struct extent_buffer *buf;
1741
1742         ret = btrfs_alloc_extent(trans, root, blocksize,
1743                                  root_objectid, ref_generation,
1744                                  level, first_objectid, empty_size, hint,
1745                                  (u64)-1, &ins, 0);
1746         if (ret) {
1747                 BUG_ON(ret > 0);
1748                 return ERR_PTR(ret);
1749         }
1750         buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
1751         if (!buf) {
1752                 btrfs_free_extent(trans, root, ins.objectid, blocksize,
1753                                   root->root_key.objectid, ref_generation,
1754                                   0, 0, 0);
1755                 return ERR_PTR(-ENOMEM);
1756         }
1757         btrfs_set_buffer_uptodate(buf);
1758         set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
1759                          buf->start + buf->len - 1, GFP_NOFS);
1760         set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->extent_tree,
1761                         buf->start, buf->start + buf->len - 1,
1762                         EXTENT_CSUM, GFP_NOFS);
1763         buf->flags |= EXTENT_CSUM;
1764         btrfs_set_buffer_defrag(buf);
1765         trans->blocks_used++;
1766         return buf;
1767 }
1768
1769 static int drop_leaf_ref(struct btrfs_trans_handle *trans,
1770                          struct btrfs_root *root, struct extent_buffer *leaf)
1771 {
1772         u64 leaf_owner;
1773         u64 leaf_generation;
1774         struct btrfs_key key;
1775         struct btrfs_file_extent_item *fi;
1776         int i;
1777         int nritems;
1778         int ret;
1779
1780         BUG_ON(!btrfs_is_leaf(leaf));
1781         nritems = btrfs_header_nritems(leaf);
1782         leaf_owner = btrfs_header_owner(leaf);
1783         leaf_generation = btrfs_header_generation(leaf);
1784
1785         for (i = 0; i < nritems; i++) {
1786                 u64 disk_bytenr;
1787
1788                 btrfs_item_key_to_cpu(leaf, &key, i);
1789                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1790                         continue;
1791                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1792                 if (btrfs_file_extent_type(leaf, fi) ==
1793                     BTRFS_FILE_EXTENT_INLINE)
1794                         continue;
1795                 /*
1796                  * FIXME make sure to insert a trans record that
1797                  * repeats the snapshot del on crash
1798                  */
1799                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1800                 if (disk_bytenr == 0)
1801                         continue;
1802                 ret = btrfs_free_extent(trans, root, disk_bytenr,
1803                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
1804                                 leaf_owner, leaf_generation,
1805                                 key.objectid, key.offset, 0);
1806                 BUG_ON(ret);
1807         }
1808         return 0;
1809 }
1810
1811 static void reada_walk_down(struct btrfs_root *root,
1812                             struct extent_buffer *node)
1813 {
1814         int i;
1815         u32 nritems;
1816         u64 bytenr;
1817         int ret;
1818         u32 refs;
1819         int level;
1820         u32 blocksize;
1821
1822         nritems = btrfs_header_nritems(node);
1823         level = btrfs_header_level(node);
1824         for (i = 0; i < nritems; i++) {
1825                 bytenr = btrfs_node_blockptr(node, i);
1826                 blocksize = btrfs_level_size(root, level - 1);
1827                 ret = lookup_extent_ref(NULL, root, bytenr, blocksize, &refs);
1828                 BUG_ON(ret);
1829                 if (refs != 1)
1830                         continue;
1831                 mutex_unlock(&root->fs_info->fs_mutex);
1832                 ret = readahead_tree_block(root, bytenr, blocksize);
1833                 cond_resched();
1834                 mutex_lock(&root->fs_info->fs_mutex);
1835                 if (ret)
1836                         break;
1837         }
1838 }
1839
1840 /*
1841  * helper function for drop_snapshot, this walks down the tree dropping ref
1842  * counts as it goes.
1843  */
1844 static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
1845                           *root, struct btrfs_path *path, int *level)
1846 {
1847         u64 root_owner;
1848         u64 root_gen;
1849         u64 bytenr;
1850         struct extent_buffer *next;
1851         struct extent_buffer *cur;
1852         struct extent_buffer *parent;
1853         u32 blocksize;
1854         int ret;
1855         u32 refs;
1856
1857         WARN_ON(*level < 0);
1858         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1859         ret = lookup_extent_ref(trans, root,
1860                                 path->nodes[*level]->start,
1861                                 path->nodes[*level]->len, &refs);
1862         BUG_ON(ret);
1863         if (refs > 1)
1864                 goto out;
1865
1866         /*
1867          * walk down to the last node level and free all the leaves
1868          */
1869         while(*level >= 0) {
1870                 WARN_ON(*level < 0);
1871                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1872                 cur = path->nodes[*level];
1873
1874                 if (*level > 0 && path->slots[*level] == 0)
1875                         reada_walk_down(root, cur);
1876
1877                 if (btrfs_header_level(cur) != *level)
1878                         WARN_ON(1);
1879
1880                 if (path->slots[*level] >=
1881                     btrfs_header_nritems(cur))
1882                         break;
1883                 if (*level == 0) {
1884                         ret = drop_leaf_ref(trans, root, cur);
1885                         BUG_ON(ret);
1886                         break;
1887                 }
1888                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1889                 blocksize = btrfs_level_size(root, *level - 1);
1890                 ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
1891                 BUG_ON(ret);
1892                 if (refs != 1) {
1893                         parent = path->nodes[*level];
1894                         root_owner = btrfs_header_owner(parent);
1895                         root_gen = btrfs_header_generation(parent);
1896                         path->slots[*level]++;
1897                         ret = btrfs_free_extent(trans, root, bytenr,
1898                                                 blocksize, root_owner,
1899                                                 root_gen, 0, 0, 1);
1900                         BUG_ON(ret);
1901                         continue;
1902                 }
1903                 next = btrfs_find_tree_block(root, bytenr, blocksize);
1904                 if (!next || !btrfs_buffer_uptodate(next)) {
1905                         free_extent_buffer(next);
1906                         mutex_unlock(&root->fs_info->fs_mutex);
1907                         next = read_tree_block(root, bytenr, blocksize);
1908                         mutex_lock(&root->fs_info->fs_mutex);
1909
1910                         /* we dropped the lock, check one more time */
1911                         ret = lookup_extent_ref(trans, root, bytenr,
1912                                                 blocksize, &refs);
1913                         BUG_ON(ret);
1914                         if (refs != 1) {
1915                                 parent = path->nodes[*level];
1916                                 root_owner = btrfs_header_owner(parent);
1917                                 root_gen = btrfs_header_generation(parent);
1918
1919                                 path->slots[*level]++;
1920                                 free_extent_buffer(next);
1921                                 ret = btrfs_free_extent(trans, root, bytenr,
1922                                                         blocksize,
1923                                                         root_owner,
1924                                                         root_gen, 0, 0, 1);
1925                                 BUG_ON(ret);
1926                                 continue;
1927                         }
1928                 }
1929                 WARN_ON(*level <= 0);
1930                 if (path->nodes[*level-1])
1931                         free_extent_buffer(path->nodes[*level-1]);
1932                 path->nodes[*level-1] = next;
1933                 *level = btrfs_header_level(next);
1934                 path->slots[*level] = 0;
1935         }
1936 out:
1937         WARN_ON(*level < 0);
1938         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1939
1940         if (path->nodes[*level] == root->node) {
1941                 root_owner = root->root_key.objectid;
1942                 parent = path->nodes[*level];
1943         } else {
1944                 parent = path->nodes[*level + 1];
1945                 root_owner = btrfs_header_owner(parent);
1946         }
1947
1948         root_gen = btrfs_header_generation(parent);
1949         ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
1950                                 path->nodes[*level]->len,
1951                                 root_owner, root_gen, 0, 0, 1);
1952         free_extent_buffer(path->nodes[*level]);
1953         path->nodes[*level] = NULL;
1954         *level += 1;
1955         BUG_ON(ret);
1956         return 0;
1957 }
1958
1959 /*
1960  * helper for dropping snapshots.  This walks back up the tree in the path
1961  * to find the first node higher up where we haven't yet gone through
1962  * all the slots
1963  */
1964 static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
1965                         *root, struct btrfs_path *path, int *level)
1966 {
1967         u64 root_owner;
1968         u64 root_gen;
1969         struct btrfs_root_item *root_item = &root->root_item;
1970         int i;
1971         int slot;
1972         int ret;
1973
1974         for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1975                 slot = path->slots[i];
1976                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1977                         struct extent_buffer *node;
1978                         struct btrfs_disk_key disk_key;
1979                         node = path->nodes[i];
1980                         path->slots[i]++;
1981                         *level = i;
1982                         WARN_ON(*level == 0);
1983                         btrfs_node_key(node, &disk_key, path->slots[i]);
1984                         memcpy(&root_item->drop_progress,
1985                                &disk_key, sizeof(disk_key));
1986                         root_item->drop_level = i;
1987                         return 0;
1988                 } else {
1989                         if (path->nodes[*level] == root->node) {
1990                                 root_owner = root->root_key.objectid;
1991                                 root_gen =
1992                                    btrfs_header_generation(path->nodes[*level]);
1993                         } else {
1994                                 struct extent_buffer *node;
1995                                 node = path->nodes[*level + 1];
1996                                 root_owner = btrfs_header_owner(node);
1997                                 root_gen = btrfs_header_generation(node);
1998                         }
1999                         ret = btrfs_free_extent(trans, root,
2000                                                 path->nodes[*level]->start,
2001                                                 path->nodes[*level]->len,
2002                                                 root_owner, root_gen, 0, 0, 1);
2003                         BUG_ON(ret);
2004                         free_extent_buffer(path->nodes[*level]);
2005                         path->nodes[*level] = NULL;
2006                         *level = i + 1;
2007                 }
2008         }
2009         return 1;
2010 }
2011
2012 /*
2013  * drop the reference count on the tree rooted at 'snap'.  This traverses
2014  * the tree freeing any blocks that have a ref count of zero after being
2015  * decremented.
2016  */
2017 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
2018                         *root)
2019 {
2020         int ret = 0;
2021         int wret;
2022         int level;
2023         struct btrfs_path *path;
2024         int i;
2025         int orig_level;
2026         struct btrfs_root_item *root_item = &root->root_item;
2027
2028         path = btrfs_alloc_path();
2029         BUG_ON(!path);
2030
2031         level = btrfs_header_level(root->node);
2032         orig_level = level;
2033         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2034                 path->nodes[level] = root->node;
2035                 extent_buffer_get(root->node);
2036                 path->slots[level] = 0;
2037         } else {
2038                 struct btrfs_key key;
2039                 struct btrfs_disk_key found_key;
2040                 struct extent_buffer *node;
2041
2042                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2043                 level = root_item->drop_level;
2044                 path->lowest_level = level;
2045                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2046                 if (wret < 0) {
2047                         ret = wret;
2048                         goto out;
2049                 }
2050                 node = path->nodes[level];
2051                 btrfs_node_key(node, &found_key, path->slots[level]);
2052                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
2053                                sizeof(found_key)));
2054         }
2055         while(1) {
2056                 wret = walk_down_tree(trans, root, path, &level);
2057                 if (wret > 0)
2058                         break;
2059                 if (wret < 0)
2060                         ret = wret;
2061
2062                 wret = walk_up_tree(trans, root, path, &level);
2063                 if (wret > 0)
2064                         break;
2065                 if (wret < 0)
2066                         ret = wret;
2067                 ret = -EAGAIN;
2068                 break;
2069         }
2070         for (i = 0; i <= orig_level; i++) {
2071                 if (path->nodes[i]) {
2072                         free_extent_buffer(path->nodes[i]);
2073                         path->nodes[i] = NULL;
2074                 }
2075         }
2076 out:
2077         btrfs_free_path(path);
2078         return ret;
2079 }
2080
2081 int btrfs_free_block_groups(struct btrfs_fs_info *info)
2082 {
2083         u64 start;
2084         u64 end;
2085         u64 ptr;
2086         int ret;
2087         while(1) {
2088                 ret = find_first_extent_bit(&info->block_group_cache, 0,
2089                                             &start, &end, (unsigned int)-1);
2090                 if (ret)
2091                         break;
2092                 ret = get_state_private(&info->block_group_cache, start, &ptr);
2093                 if (!ret)
2094                         kfree((void *)(unsigned long)ptr);
2095                 clear_extent_bits(&info->block_group_cache, start,
2096                                   end, (unsigned int)-1, GFP_NOFS);
2097         }
2098         while(1) {
2099                 ret = find_first_extent_bit(&info->free_space_cache, 0,
2100                                             &start, &end, EXTENT_DIRTY);
2101                 if (ret)
2102                         break;
2103                 clear_extent_dirty(&info->free_space_cache, start,
2104                                    end, GFP_NOFS);
2105         }
2106         return 0;
2107 }
2108
2109 static int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2110 {
2111         u64 page_start;
2112         u64 page_end;
2113         u64 delalloc_start;
2114         u64 existing_delalloc;
2115         unsigned long last_index;
2116         unsigned long first_index;
2117         unsigned long i;
2118         struct page *page;
2119         struct btrfs_root *root = BTRFS_I(inode)->root;
2120         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2121         struct file_ra_state ra;
2122
2123         mutex_lock(&inode->i_mutex);
2124         first_index = start >> PAGE_CACHE_SHIFT;
2125         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2126
2127         memset(&ra, 0, sizeof(ra));
2128         file_ra_state_init(&ra, inode->i_mapping);
2129         btrfs_force_ra(inode->i_mapping, &ra, NULL, first_index, last_index);
2130
2131         for (i = first_index; i <= last_index; i++) {
2132                 page = grab_cache_page(inode->i_mapping, i);
2133                 if (!page)
2134                         goto out_unlock;
2135                 if (!PageUptodate(page)) {
2136                         btrfs_readpage(NULL, page);
2137                         lock_page(page);
2138                         if (!PageUptodate(page)) {
2139                                 unlock_page(page);
2140                                 page_cache_release(page);
2141                                 goto out_unlock;
2142                         }
2143                 }
2144                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2145                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2146
2147                 lock_extent(em_tree, page_start, page_end, GFP_NOFS);
2148
2149                 delalloc_start = page_start;
2150                 existing_delalloc =
2151                         count_range_bits(&BTRFS_I(inode)->extent_tree,
2152                                          &delalloc_start, page_end,
2153                                          PAGE_CACHE_SIZE, EXTENT_DELALLOC);
2154
2155                 set_extent_delalloc(em_tree, page_start,
2156                                     page_end, GFP_NOFS);
2157
2158                 spin_lock(&root->fs_info->delalloc_lock);
2159                 root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE -
2160                                                  existing_delalloc;
2161                 spin_unlock(&root->fs_info->delalloc_lock);
2162
2163                 unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
2164                 set_page_dirty(page);
2165                 unlock_page(page);
2166                 page_cache_release(page);
2167         }
2168
2169 out_unlock:
2170         mutex_unlock(&inode->i_mutex);
2171         return 0;
2172 }
2173
2174 static int relocate_one_reference(struct btrfs_root *extent_root,
2175                                   struct btrfs_path *path,
2176                                   struct btrfs_key *extent_key,
2177                                   u64 ref_root, u64 ref_gen, u64 ref_objectid,
2178                                   u64 ref_offset)
2179 {
2180         struct inode *inode;
2181         struct btrfs_root *found_root;
2182         struct btrfs_key root_location;
2183         int ret;
2184
2185         root_location.objectid = ref_root;
2186         if (ref_gen == 0)
2187                 root_location.offset = 0;
2188         else
2189                 root_location.offset = (u64)-1;
2190         root_location.type = BTRFS_ROOT_ITEM_KEY;
2191
2192         found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
2193                                                 &root_location);
2194         BUG_ON(!found_root);
2195
2196         if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2197                 mutex_unlock(&extent_root->fs_info->fs_mutex);
2198                 inode = btrfs_iget_locked(extent_root->fs_info->sb,
2199                                           ref_objectid, found_root);
2200                 if (inode->i_state & I_NEW) {
2201                         /* the inode and parent dir are two different roots */
2202                         BTRFS_I(inode)->root = found_root;
2203                         BTRFS_I(inode)->location.objectid = ref_objectid;
2204                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
2205                         BTRFS_I(inode)->location.offset = 0;
2206                         btrfs_read_locked_inode(inode);
2207                         unlock_new_inode(inode);
2208
2209                 }
2210                 /* this can happen if the reference is not against
2211                  * the latest version of the tree root
2212                  */
2213                 if (is_bad_inode(inode)) {
2214                         mutex_lock(&extent_root->fs_info->fs_mutex);
2215                         goto out;
2216                 }
2217                 relocate_inode_pages(inode, ref_offset, extent_key->offset);
2218                 /* FIXME, data=ordered will help get rid of this */
2219                 filemap_fdatawrite(inode->i_mapping);
2220                 iput(inode);
2221                 mutex_lock(&extent_root->fs_info->fs_mutex);
2222         } else {
2223                 struct btrfs_trans_handle *trans;
2224                 struct btrfs_key found_key;
2225                 struct extent_buffer *eb;
2226                 int level;
2227                 int i;
2228
2229                 trans = btrfs_start_transaction(found_root, 1);
2230                 eb = read_tree_block(found_root, extent_key->objectid,
2231                                      extent_key->offset);
2232                 level = btrfs_header_level(eb);
2233
2234                 if (level == 0)
2235                         btrfs_item_key_to_cpu(eb, &found_key, 0);
2236                 else
2237                         btrfs_node_key_to_cpu(eb, &found_key, 0);
2238
2239                 free_extent_buffer(eb);
2240
2241                 path->lowest_level = level;
2242                 path->reada = 2;
2243                 ret = btrfs_search_slot(trans, found_root, &found_key, path,
2244                                         0, 1);
2245                 path->lowest_level = 0;
2246                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2247                         if (!path->nodes[i])
2248                                 break;
2249                         free_extent_buffer(path->nodes[i]);
2250                         path->nodes[i] = NULL;
2251                 }
2252                 btrfs_release_path(found_root, path);
2253                 btrfs_end_transaction(trans, found_root);
2254         }
2255
2256 out:
2257         return 0;
2258 }
2259
2260 static int relocate_one_extent(struct btrfs_root *extent_root,
2261                                struct btrfs_path *path,
2262                                struct btrfs_key *extent_key)
2263 {
2264         struct btrfs_key key;
2265         struct btrfs_key found_key;
2266         struct btrfs_extent_ref *ref;
2267         struct extent_buffer *leaf;
2268         u64 ref_root;
2269         u64 ref_gen;
2270         u64 ref_objectid;
2271         u64 ref_offset;
2272         u32 nritems;
2273         u32 item_size;
2274         int ret = 0;
2275
2276         key.objectid = extent_key->objectid;
2277         key.type = BTRFS_EXTENT_REF_KEY;
2278         key.offset = 0;
2279
2280         while(1) {
2281                 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2282
2283                 BUG_ON(ret == 0);
2284
2285                 if (ret < 0)
2286                         goto out;
2287
2288                 ret = 0;
2289                 leaf = path->nodes[0];
2290                 nritems = btrfs_header_nritems(leaf);
2291                 if (path->slots[0] == nritems)
2292                         goto out;
2293
2294                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2295                 if (found_key.objectid != extent_key->objectid)
2296                         break;
2297
2298                 if (found_key.type != BTRFS_EXTENT_REF_KEY)
2299                         break;
2300
2301                 key.offset = found_key.offset + 1;
2302                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2303
2304                 ref = btrfs_item_ptr(leaf, path->slots[0],
2305                                      struct btrfs_extent_ref);
2306                 ref_root = btrfs_ref_root(leaf, ref);
2307                 ref_gen = btrfs_ref_generation(leaf, ref);
2308                 ref_objectid = btrfs_ref_objectid(leaf, ref);
2309                 ref_offset = btrfs_ref_offset(leaf, ref);
2310                 btrfs_release_path(extent_root, path);
2311
2312                 ret = relocate_one_reference(extent_root, path,
2313                                              extent_key, ref_root, ref_gen,
2314                                              ref_objectid, ref_offset);
2315                 if (ret)
2316                         goto out;
2317         }
2318         ret = 0;
2319 out:
2320         btrfs_release_path(extent_root, path);
2321         return ret;
2322 }
2323
2324 static int find_overlapping_extent(struct btrfs_root *root,
2325                                    struct btrfs_path *path, u64 new_size)
2326 {
2327         struct btrfs_key found_key;
2328         struct extent_buffer *leaf;
2329         int ret;
2330
2331         while(1) {
2332                 if (path->slots[0] == 0) {
2333                         ret = btrfs_prev_leaf(root, path);
2334                         if (ret == 1) {
2335                                 return 1;
2336                         }
2337                         if (ret < 0)
2338                                 return ret;
2339                 } else {
2340                         path->slots[0]--;
2341                 }
2342                 leaf = path->nodes[0];
2343                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2344                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY) {
2345                         if (found_key.objectid + found_key.offset > new_size)
2346                                 return 0;
2347                         else
2348                                 return 1;
2349                 }
2350         }
2351         return 1;
2352 }
2353
2354 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
2355 {
2356         struct btrfs_trans_handle *trans;
2357         struct btrfs_root *tree_root = root->fs_info->tree_root;
2358         struct btrfs_path *path;
2359         u64 cur_byte;
2360         u64 total_found;
2361         u64 ptr;
2362         struct btrfs_fs_info *info = root->fs_info;
2363         struct extent_map_tree *block_group_cache;
2364         struct btrfs_key key;
2365         struct btrfs_key found_key = { 0, 0, 0 };
2366         struct extent_buffer *leaf;
2367         u32 nritems;
2368         int ret;
2369         int slot;
2370
2371         btrfs_set_super_total_bytes(&info->super_copy, new_size);
2372         block_group_cache = &info->block_group_cache;
2373         path = btrfs_alloc_path();
2374         root = root->fs_info->extent_root;
2375         path->reada = 2;
2376
2377 again:
2378         total_found = 0;
2379         key.objectid = new_size;
2380         cur_byte = key.objectid;
2381         key.offset = 0;
2382         key.type = 0;
2383         while(1) {
2384                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2385                 if (ret < 0)
2386                         goto out;
2387 next:
2388                 leaf = path->nodes[0];
2389                 if (key.objectid == new_size - 1) {
2390                         ret = find_overlapping_extent(root, path, new_size);
2391                         if (ret != 0) {
2392                                 btrfs_release_path(root, path);
2393                                 ret = btrfs_search_slot(NULL, root, &key,
2394                                                         path, 0, 0);
2395                                 if (ret < 0)
2396                                         goto out;
2397                         }
2398                 }
2399                 nritems = btrfs_header_nritems(leaf);
2400                 ret = 0;
2401                 slot = path->slots[0];
2402                 if (slot < nritems)
2403                         btrfs_item_key_to_cpu(leaf, &found_key, slot);
2404                 if (slot == nritems ||
2405                     btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY) {
2406                         path->slots[0]++;
2407                         if (path->slots[0] >= nritems) {
2408                                 ret = btrfs_next_leaf(root, path);
2409                                 if (ret < 0)
2410                                         goto out;
2411                                 if (ret == 1) {
2412                                         ret = 0;
2413                                         break;
2414                                 }
2415                         }
2416                         goto next;
2417                 }
2418                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2419                 if (found_key.objectid + found_key.offset <= cur_byte)
2420                         continue;
2421                 total_found++;
2422                 cur_byte = found_key.objectid + found_key.offset;
2423                 key.objectid = cur_byte;
2424                 btrfs_release_path(root, path);
2425                 ret = relocate_one_extent(root, path, &found_key);
2426         }
2427
2428         btrfs_release_path(root, path);
2429
2430         if (total_found > 0) {
2431                 trans = btrfs_start_transaction(tree_root, 1);
2432                 btrfs_commit_transaction(trans, tree_root);
2433
2434                 mutex_unlock(&root->fs_info->fs_mutex);
2435                 btrfs_clean_old_snapshots(tree_root);
2436                 mutex_lock(&root->fs_info->fs_mutex);
2437
2438                 trans = btrfs_start_transaction(tree_root, 1);
2439                 btrfs_commit_transaction(trans, tree_root);
2440                 goto again;
2441         }
2442
2443         trans = btrfs_start_transaction(root, 1);
2444         key.objectid = new_size;
2445         key.offset = 0;
2446         key.type = 0;
2447         while(1) {
2448                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2449                 if (ret < 0)
2450                         goto out;
2451 bg_next:
2452                 leaf = path->nodes[0];
2453                 nritems = btrfs_header_nritems(leaf);
2454                 ret = 0;
2455                 slot = path->slots[0];
2456                 if (slot < nritems)
2457                         btrfs_item_key_to_cpu(leaf, &found_key, slot);
2458                 if (slot == nritems ||
2459                     btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) {
2460                         if (slot < nritems) {
2461                                 printk("shrinker found key %Lu %u %Lu\n",
2462                                        found_key.objectid, found_key.type,
2463                                        found_key.offset);
2464                                 path->slots[0]++;
2465                         }
2466                         if (path->slots[0] >= nritems) {
2467                                 ret = btrfs_next_leaf(root, path);
2468                                 if (ret < 0)
2469                                         break;
2470                                 if (ret == 1) {
2471                                         ret = 0;
2472                                         break;
2473                                 }
2474                         }
2475                         goto bg_next;
2476                 }
2477                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2478                 ret = get_state_private(&info->block_group_cache,
2479                                         found_key.objectid, &ptr);
2480                 if (!ret)
2481                         kfree((void *)(unsigned long)ptr);
2482
2483                 clear_extent_bits(&info->block_group_cache, found_key.objectid,
2484                                   found_key.objectid + found_key.offset - 1,
2485                                   (unsigned int)-1, GFP_NOFS);
2486
2487                 key.objectid = found_key.objectid + 1;
2488                 btrfs_del_item(trans, root, path);
2489                 btrfs_release_path(root, path);
2490         }
2491         clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
2492                            GFP_NOFS);
2493         btrfs_commit_transaction(trans, root);
2494 out:
2495         btrfs_free_path(path);
2496         return ret;
2497 }
2498
2499 int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
2500                            struct btrfs_root *root, u64 new_size)
2501 {
2502         struct btrfs_path *path;
2503         u64 nr = 0;
2504         u64 cur_byte;
2505         u64 old_size;
2506         struct btrfs_block_group_cache *cache;
2507         struct btrfs_block_group_item *item;
2508         struct btrfs_fs_info *info = root->fs_info;
2509         struct extent_map_tree *block_group_cache;
2510         struct btrfs_key key;
2511         struct extent_buffer *leaf;
2512         int ret;
2513         int bit;
2514
2515         old_size = btrfs_super_total_bytes(&info->super_copy);
2516         block_group_cache = &info->block_group_cache;
2517
2518         root = info->extent_root;
2519
2520         cache = btrfs_lookup_block_group(root->fs_info, old_size - 1);
2521
2522         cur_byte = cache->key.objectid + cache->key.offset;
2523         if (cur_byte >= new_size)
2524                 goto set_size;
2525
2526         key.offset = BTRFS_BLOCK_GROUP_SIZE;
2527         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
2528
2529         path = btrfs_alloc_path();
2530         if (!path)
2531                 return -ENOMEM;
2532
2533         while(cur_byte < new_size) {
2534                 key.objectid = cur_byte;
2535                 ret = btrfs_insert_empty_item(trans, root, path, &key,
2536                                         sizeof(struct btrfs_block_group_item));
2537                 BUG_ON(ret);
2538                 leaf = path->nodes[0];
2539                 item = btrfs_item_ptr(leaf, path->slots[0],
2540                                       struct btrfs_block_group_item);
2541
2542                 btrfs_set_disk_block_group_used(leaf, item, 0);
2543                 if (nr % 3) {
2544                         btrfs_set_disk_block_group_flags(leaf, item,
2545                                                  BTRFS_BLOCK_GROUP_DATA);
2546                 } else {
2547                         btrfs_set_disk_block_group_flags(leaf, item, 0);
2548                 }
2549                 nr++;
2550
2551                 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2552                 BUG_ON(!cache);
2553
2554                 read_extent_buffer(leaf, &cache->item, (unsigned long)item,
2555                                    sizeof(cache->item));
2556
2557                 memcpy(&cache->key, &key, sizeof(key));
2558                 cache->cached = 0;
2559                 cache->pinned = 0;
2560                 cur_byte = key.objectid + key.offset;
2561                 btrfs_release_path(root, path);
2562
2563                 if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
2564                         bit = BLOCK_GROUP_DATA;
2565                         cache->data = BTRFS_BLOCK_GROUP_DATA;
2566                 } else {
2567                         bit = BLOCK_GROUP_METADATA;
2568                         cache->data = 0;
2569                 }
2570
2571                 /* use EXTENT_LOCKED to prevent merging */
2572                 set_extent_bits(block_group_cache, key.objectid,
2573                                 key.objectid + key.offset - 1,
2574                                 bit | EXTENT_LOCKED, GFP_NOFS);
2575                 set_state_private(block_group_cache, key.objectid,
2576                                   (unsigned long)cache);
2577         }
2578         btrfs_free_path(path);
2579 set_size:
2580         btrfs_set_super_total_bytes(&info->super_copy, new_size);
2581         return 0;
2582 }
2583
2584 int btrfs_read_block_groups(struct btrfs_root *root)
2585 {
2586         struct btrfs_path *path;
2587         int ret;
2588         int err = 0;
2589         int bit;
2590         struct btrfs_block_group_cache *cache;
2591         struct btrfs_fs_info *info = root->fs_info;
2592         struct extent_map_tree *block_group_cache;
2593         struct btrfs_key key;
2594         struct btrfs_key found_key;
2595         struct extent_buffer *leaf;
2596
2597         block_group_cache = &info->block_group_cache;
2598
2599         root = info->extent_root;
2600         key.objectid = 0;
2601         key.offset = BTRFS_BLOCK_GROUP_SIZE;
2602         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
2603
2604         path = btrfs_alloc_path();
2605         if (!path)
2606                 return -ENOMEM;
2607
2608         while(1) {
2609                 ret = btrfs_search_slot(NULL, info->extent_root,
2610                                         &key, path, 0, 0);
2611                 if (ret != 0) {
2612                         err = ret;
2613                         break;
2614                 }
2615                 leaf = path->nodes[0];
2616                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2617                 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2618                 if (!cache) {
2619                         err = -1;
2620                         break;
2621                 }
2622
2623                 read_extent_buffer(leaf, &cache->item,
2624                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
2625                                    sizeof(cache->item));
2626                 memcpy(&cache->key, &found_key, sizeof(found_key));
2627                 cache->cached = 0;
2628                 cache->pinned = 0;
2629                 key.objectid = found_key.objectid + found_key.offset;
2630                 btrfs_release_path(root, path);
2631
2632                 if (cache->item.flags & BTRFS_BLOCK_GROUP_MIXED) {
2633                         bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
2634                         cache->data = BTRFS_BLOCK_GROUP_MIXED;
2635                 } else if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
2636                         bit = BLOCK_GROUP_DATA;
2637                         cache->data = BTRFS_BLOCK_GROUP_DATA;
2638                 } else {
2639                         bit = BLOCK_GROUP_METADATA;
2640                         cache->data = 0;
2641                 }
2642
2643                 /* use EXTENT_LOCKED to prevent merging */
2644                 set_extent_bits(block_group_cache, found_key.objectid,
2645                                 found_key.objectid + found_key.offset - 1,
2646                                 bit | EXTENT_LOCKED, GFP_NOFS);
2647                 set_state_private(block_group_cache, found_key.objectid,
2648                                   (unsigned long)cache);
2649
2650                 if (key.objectid >=
2651                     btrfs_super_total_bytes(&info->super_copy))
2652                         break;
2653         }
2654
2655         btrfs_free_path(path);
2656         return 0;
2657 }