Btrfs: Chunk relocation fine tuning, and add a few printks to show progress
[pandora-kernel.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include "hash.h"
22 #include "crc32c.h"
23 #include "ctree.h"
24 #include "disk-io.h"
25 #include "print-tree.h"
26 #include "transaction.h"
27 #include "volumes.h"
28
29 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
30 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
31 #define BLOCK_GROUP_SYSTEM   EXTENT_NEW
32
33 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
34
35 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
36                                  btrfs_root *extent_root);
37 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
38                                btrfs_root *extent_root);
39
40
41 static int cache_block_group(struct btrfs_root *root,
42                              struct btrfs_block_group_cache *block_group)
43 {
44         struct btrfs_path *path;
45         int ret;
46         struct btrfs_key key;
47         struct extent_buffer *leaf;
48         struct extent_io_tree *free_space_cache;
49         int slot;
50         u64 last = 0;
51         u64 hole_size;
52         u64 first_free;
53         int found = 0;
54
55         if (!block_group)
56                 return 0;
57
58         root = root->fs_info->extent_root;
59         free_space_cache = &root->fs_info->free_space_cache;
60
61         if (block_group->cached)
62                 return 0;
63
64         path = btrfs_alloc_path();
65         if (!path)
66                 return -ENOMEM;
67
68         path->reada = 2;
69         first_free = block_group->key.objectid;
70         key.objectid = block_group->key.objectid;
71         key.offset = 0;
72         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
73         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
74         if (ret < 0)
75                 return ret;
76         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
77         if (ret < 0)
78                 return ret;
79         if (ret == 0) {
80                 leaf = path->nodes[0];
81                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
82                 if (key.objectid + key.offset > first_free)
83                         first_free = key.objectid + key.offset;
84         }
85         while(1) {
86                 leaf = path->nodes[0];
87                 slot = path->slots[0];
88                 if (slot >= btrfs_header_nritems(leaf)) {
89                         ret = btrfs_next_leaf(root, path);
90                         if (ret < 0)
91                                 goto err;
92                         if (ret == 0) {
93                                 continue;
94                         } else {
95                                 break;
96                         }
97                 }
98                 btrfs_item_key_to_cpu(leaf, &key, slot);
99                 if (key.objectid < block_group->key.objectid) {
100                         goto next;
101                 }
102                 if (key.objectid >= block_group->key.objectid +
103                     block_group->key.offset) {
104                         break;
105                 }
106
107                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
108                         if (!found) {
109                                 last = first_free;
110                                 found = 1;
111                         }
112                         if (key.objectid > last) {
113                                 hole_size = key.objectid - last;
114                                 set_extent_dirty(free_space_cache, last,
115                                                  last + hole_size - 1,
116                                                  GFP_NOFS);
117                         }
118                         last = key.objectid + key.offset;
119                 }
120 next:
121                 path->slots[0]++;
122         }
123
124         if (!found)
125                 last = first_free;
126         if (block_group->key.objectid +
127             block_group->key.offset > last) {
128                 hole_size = block_group->key.objectid +
129                         block_group->key.offset - last;
130                 set_extent_dirty(free_space_cache, last,
131                                  last + hole_size - 1, GFP_NOFS);
132         }
133         block_group->cached = 1;
134 err:
135         btrfs_free_path(path);
136         return 0;
137 }
138
139 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
140                                                          btrfs_fs_info *info,
141                                                          u64 bytenr)
142 {
143         struct extent_io_tree *block_group_cache;
144         struct btrfs_block_group_cache *block_group = NULL;
145         u64 ptr;
146         u64 start;
147         u64 end;
148         int ret;
149
150         bytenr = max_t(u64, bytenr,
151                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
152         block_group_cache = &info->block_group_cache;
153         ret = find_first_extent_bit(block_group_cache,
154                                     bytenr, &start, &end,
155                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
156                                     BLOCK_GROUP_SYSTEM);
157         if (ret) {
158                 return NULL;
159         }
160         ret = get_state_private(block_group_cache, start, &ptr);
161         if (ret)
162                 return NULL;
163
164         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
165         if (block_group->key.objectid <= bytenr && bytenr <
166             block_group->key.objectid + block_group->key.offset)
167                 return block_group;
168         return NULL;
169 }
170
171 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
172 {
173         return (cache->flags & bits) == bits;
174 }
175
176 static int noinline find_search_start(struct btrfs_root *root,
177                               struct btrfs_block_group_cache **cache_ret,
178                               u64 *start_ret, int num, int data)
179 {
180         int ret;
181         struct btrfs_block_group_cache *cache = *cache_ret;
182         struct extent_io_tree *free_space_cache;
183         struct extent_state *state;
184         u64 last;
185         u64 start = 0;
186         u64 cache_miss = 0;
187         u64 total_fs_bytes;
188         u64 search_start = *start_ret;
189         int wrapped = 0;
190
191         if (!cache)
192                 goto out;
193
194         total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
195         free_space_cache = &root->fs_info->free_space_cache;
196
197 again:
198         ret = cache_block_group(root, cache);
199         if (ret)
200                 goto out;
201
202         last = max(search_start, cache->key.objectid);
203         if (!block_group_bits(cache, data) || cache->ro) {
204                 goto new_group;
205         }
206
207         spin_lock_irq(&free_space_cache->lock);
208         state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
209         while(1) {
210                 if (!state) {
211                         if (!cache_miss)
212                                 cache_miss = last;
213                         spin_unlock_irq(&free_space_cache->lock);
214                         goto new_group;
215                 }
216
217                 start = max(last, state->start);
218                 last = state->end + 1;
219                 if (last - start < num) {
220                         if (last == cache->key.objectid + cache->key.offset)
221                                 cache_miss = start;
222                         do {
223                                 state = extent_state_next(state);
224                         } while(state && !(state->state & EXTENT_DIRTY));
225                         continue;
226                 }
227                 spin_unlock_irq(&free_space_cache->lock);
228                 if (cache->ro)
229                         goto new_group;
230                 if (start + num > cache->key.objectid + cache->key.offset)
231                         goto new_group;
232                 if (start + num  > total_fs_bytes)
233                         goto new_group;
234                 if (!block_group_bits(cache, data)) {
235                         printk("block group bits don't match %Lu %d\n", cache->flags, data);
236                 }
237                 *start_ret = start;
238                 return 0;
239         }
240 out:
241         cache = btrfs_lookup_block_group(root->fs_info, search_start);
242         if (!cache) {
243                 printk("Unable to find block group for %Lu\n", search_start);
244                 WARN_ON(1);
245         }
246         return -ENOSPC;
247
248 new_group:
249         last = cache->key.objectid + cache->key.offset;
250 wrapped:
251         cache = btrfs_lookup_block_group(root->fs_info, last);
252         if (!cache || cache->key.objectid >= total_fs_bytes) {
253 no_cache:
254                 if (!wrapped) {
255                         wrapped = 1;
256                         last = search_start;
257                         goto wrapped;
258                 }
259                 goto out;
260         }
261         if (cache_miss && !cache->cached) {
262                 cache_block_group(root, cache);
263                 last = cache_miss;
264                 cache = btrfs_lookup_block_group(root->fs_info, last);
265         }
266         cache = btrfs_find_block_group(root, cache, last, data, 0);
267         if (!cache)
268                 goto no_cache;
269         *cache_ret = cache;
270         cache_miss = 0;
271         goto again;
272 }
273
274 static u64 div_factor(u64 num, int factor)
275 {
276         if (factor == 10)
277                 return num;
278         num *= factor;
279         do_div(num, 10);
280         return num;
281 }
282
283 static int block_group_state_bits(u64 flags)
284 {
285         int bits = 0;
286         if (flags & BTRFS_BLOCK_GROUP_DATA)
287                 bits |= BLOCK_GROUP_DATA;
288         if (flags & BTRFS_BLOCK_GROUP_METADATA)
289                 bits |= BLOCK_GROUP_METADATA;
290         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
291                 bits |= BLOCK_GROUP_SYSTEM;
292         return bits;
293 }
294
295 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
296                                                  struct btrfs_block_group_cache
297                                                  *hint, u64 search_start,
298                                                  int data, int owner)
299 {
300         struct btrfs_block_group_cache *cache;
301         struct extent_io_tree *block_group_cache;
302         struct btrfs_block_group_cache *found_group = NULL;
303         struct btrfs_fs_info *info = root->fs_info;
304         u64 used;
305         u64 last = 0;
306         u64 hint_last;
307         u64 start;
308         u64 end;
309         u64 free_check;
310         u64 ptr;
311         u64 total_fs_bytes;
312         int bit;
313         int ret;
314         int full_search = 0;
315         int factor = 10;
316
317         block_group_cache = &info->block_group_cache;
318         total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
319
320         if (data & BTRFS_BLOCK_GROUP_METADATA)
321                 factor = 9;
322
323         bit = block_group_state_bits(data);
324
325         if (search_start && search_start < total_fs_bytes) {
326                 struct btrfs_block_group_cache *shint;
327                 shint = btrfs_lookup_block_group(info, search_start);
328                 if (shint && block_group_bits(shint, data) && !shint->ro) {
329                         used = btrfs_block_group_used(&shint->item);
330                         if (used + shint->pinned <
331                             div_factor(shint->key.offset, factor)) {
332                                 return shint;
333                         }
334                 }
335         }
336         if (hint && !hint->ro && block_group_bits(hint, data) &&
337             hint->key.objectid < total_fs_bytes) {
338                 used = btrfs_block_group_used(&hint->item);
339                 if (used + hint->pinned <
340                     div_factor(hint->key.offset, factor)) {
341                         return hint;
342                 }
343                 last = hint->key.objectid + hint->key.offset;
344                 hint_last = last;
345         } else {
346                 if (hint)
347                         hint_last = max(hint->key.objectid, search_start);
348                 else
349                         hint_last = search_start;
350
351                 if (hint_last >= total_fs_bytes)
352                         hint_last = search_start;
353                 last = hint_last;
354         }
355 again:
356         while(1) {
357                 ret = find_first_extent_bit(block_group_cache, last,
358                                             &start, &end, bit);
359                 if (ret)
360                         break;
361
362                 ret = get_state_private(block_group_cache, start, &ptr);
363                 if (ret)
364                         break;
365
366                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
367                 last = cache->key.objectid + cache->key.offset;
368                 used = btrfs_block_group_used(&cache->item);
369
370                 if (cache->key.objectid > total_fs_bytes)
371                         break;
372
373                 if (!cache->ro && block_group_bits(cache, data)) {
374                         if (full_search)
375                                 free_check = cache->key.offset;
376                         else
377                                 free_check = div_factor(cache->key.offset,
378                                                         factor);
379
380                         if (used + cache->pinned < free_check) {
381                                 found_group = cache;
382                                 goto found;
383                         }
384                 }
385                 cond_resched();
386         }
387         if (!full_search) {
388                 last = search_start;
389                 full_search = 1;
390                 goto again;
391         }
392 found:
393         return found_group;
394 }
395
396 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
397                            u64 owner, u64 owner_offset)
398 {
399         u32 high_crc = ~(u32)0;
400         u32 low_crc = ~(u32)0;
401         __le64 lenum;
402         lenum = cpu_to_le64(root_objectid);
403         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
404         lenum = cpu_to_le64(ref_generation);
405         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
406         if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
407                 lenum = cpu_to_le64(owner);
408                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
409                 lenum = cpu_to_le64(owner_offset);
410                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
411         }
412         return ((u64)high_crc << 32) | (u64)low_crc;
413 }
414
415 static int match_extent_ref(struct extent_buffer *leaf,
416                             struct btrfs_extent_ref *disk_ref,
417                             struct btrfs_extent_ref *cpu_ref)
418 {
419         int ret;
420         int len;
421
422         if (cpu_ref->objectid)
423                 len = sizeof(*cpu_ref);
424         else
425                 len = 2 * sizeof(u64);
426         ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
427                                    len);
428         return ret == 0;
429 }
430
431 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
432                                           struct btrfs_root *root,
433                                           struct btrfs_path *path, u64 bytenr,
434                                           u64 root_objectid,
435                                           u64 ref_generation, u64 owner,
436                                           u64 owner_offset, int del)
437 {
438         u64 hash;
439         struct btrfs_key key;
440         struct btrfs_key found_key;
441         struct btrfs_extent_ref ref;
442         struct extent_buffer *leaf;
443         struct btrfs_extent_ref *disk_ref;
444         int ret;
445         int ret2;
446
447         btrfs_set_stack_ref_root(&ref, root_objectid);
448         btrfs_set_stack_ref_generation(&ref, ref_generation);
449         btrfs_set_stack_ref_objectid(&ref, owner);
450         btrfs_set_stack_ref_offset(&ref, owner_offset);
451
452         hash = hash_extent_ref(root_objectid, ref_generation, owner,
453                                owner_offset);
454         key.offset = hash;
455         key.objectid = bytenr;
456         key.type = BTRFS_EXTENT_REF_KEY;
457
458         while (1) {
459                 ret = btrfs_search_slot(trans, root, &key, path,
460                                         del ? -1 : 0, del);
461                 if (ret < 0)
462                         goto out;
463                 leaf = path->nodes[0];
464                 if (ret != 0) {
465                         u32 nritems = btrfs_header_nritems(leaf);
466                         if (path->slots[0] >= nritems) {
467                                 ret2 = btrfs_next_leaf(root, path);
468                                 if (ret2)
469                                         goto out;
470                                 leaf = path->nodes[0];
471                         }
472                         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
473                         if (found_key.objectid != bytenr ||
474                             found_key.type != BTRFS_EXTENT_REF_KEY)
475                                 goto out;
476                         key.offset = found_key.offset;
477                         if (del) {
478                                 btrfs_release_path(root, path);
479                                 continue;
480                         }
481                 }
482                 disk_ref = btrfs_item_ptr(path->nodes[0],
483                                           path->slots[0],
484                                           struct btrfs_extent_ref);
485                 if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
486                         ret = 0;
487                         goto out;
488                 }
489                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
490                 key.offset = found_key.offset + 1;
491                 btrfs_release_path(root, path);
492         }
493 out:
494         return ret;
495 }
496
497 /*
498  * Back reference rules.  Back refs have three main goals:
499  *
500  * 1) differentiate between all holders of references to an extent so that
501  *    when a reference is dropped we can make sure it was a valid reference
502  *    before freeing the extent.
503  *
504  * 2) Provide enough information to quickly find the holders of an extent
505  *    if we notice a given block is corrupted or bad.
506  *
507  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
508  *    maintenance.  This is actually the same as #2, but with a slightly
509  *    different use case.
510  *
511  * File extents can be referenced by:
512  *
513  * - multiple snapshots, subvolumes, or different generations in one subvol
514  * - different files inside a single subvolume (in theory, not implemented yet)
515  * - different offsets inside a file (bookend extents in file.c)
516  *
517  * The extent ref structure has fields for:
518  *
519  * - Objectid of the subvolume root
520  * - Generation number of the tree holding the reference
521  * - objectid of the file holding the reference
522  * - offset in the file corresponding to the key holding the reference
523  *
524  * When a file extent is allocated the fields are filled in:
525  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
526  *
527  * When a leaf is cow'd new references are added for every file extent found
528  * in the leaf.  It looks the same as the create case, but trans->transid
529  * will be different when the block is cow'd.
530  *
531  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
532  *
533  * When a file extent is removed either during snapshot deletion or file
534  * truncation, the corresponding back reference is found
535  * by searching for:
536  *
537  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
538  *      inode objectid, offset in file)
539  *
540  * Btree extents can be referenced by:
541  *
542  * - Different subvolumes
543  * - Different generations of the same subvolume
544  *
545  * Storing sufficient information for a full reverse mapping of a btree
546  * block would require storing the lowest key of the block in the backref,
547  * and it would require updating that lowest key either before write out or
548  * every time it changed.  Instead, the objectid of the lowest key is stored
549  * along with the level of the tree block.  This provides a hint
550  * about where in the btree the block can be found.  Searches through the
551  * btree only need to look for a pointer to that block, so they stop one
552  * level higher than the level recorded in the backref.
553  *
554  * Some btrees do not do reference counting on their extents.  These
555  * include the extent tree and the tree of tree roots.  Backrefs for these
556  * trees always have a generation of zero.
557  *
558  * When a tree block is created, back references are inserted:
559  *
560  * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
561  *
562  * When a tree block is cow'd in a reference counted root,
563  * new back references are added for all the blocks it points to.
564  * These are of the form (trans->transid will have increased since creation):
565  *
566  * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
567  *
568  * Because the lowest_key_objectid and the level are just hints
569  * they are not used when backrefs are deleted.  When a backref is deleted:
570  *
571  * if backref was for a tree root:
572  *     root_objectid = root->root_key.objectid
573  * else
574  *     root_objectid = btrfs_header_owner(parent)
575  *
576  * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
577  *
578  * Back Reference Key hashing:
579  *
580  * Back references have four fields, each 64 bits long.  Unfortunately,
581  * This is hashed into a single 64 bit number and placed into the key offset.
582  * The key objectid corresponds to the first byte in the extent, and the
583  * key type is set to BTRFS_EXTENT_REF_KEY
584  */
585 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
586                                  struct btrfs_root *root,
587                                  struct btrfs_path *path, u64 bytenr,
588                                  u64 root_objectid, u64 ref_generation,
589                                  u64 owner, u64 owner_offset)
590 {
591         u64 hash;
592         struct btrfs_key key;
593         struct btrfs_extent_ref ref;
594         struct btrfs_extent_ref *disk_ref;
595         int ret;
596
597         btrfs_set_stack_ref_root(&ref, root_objectid);
598         btrfs_set_stack_ref_generation(&ref, ref_generation);
599         btrfs_set_stack_ref_objectid(&ref, owner);
600         btrfs_set_stack_ref_offset(&ref, owner_offset);
601
602         hash = hash_extent_ref(root_objectid, ref_generation, owner,
603                                owner_offset);
604         key.offset = hash;
605         key.objectid = bytenr;
606         key.type = BTRFS_EXTENT_REF_KEY;
607
608         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
609         while (ret == -EEXIST) {
610                 disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
611                                           struct btrfs_extent_ref);
612                 if (match_extent_ref(path->nodes[0], disk_ref, &ref))
613                         goto out;
614                 key.offset++;
615                 btrfs_release_path(root, path);
616                 ret = btrfs_insert_empty_item(trans, root, path, &key,
617                                               sizeof(ref));
618         }
619         if (ret)
620                 goto out;
621         disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
622                                   struct btrfs_extent_ref);
623         write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
624                             sizeof(ref));
625         btrfs_mark_buffer_dirty(path->nodes[0]);
626 out:
627         btrfs_release_path(root, path);
628         return ret;
629 }
630
631 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
632                                 struct btrfs_root *root,
633                                 u64 bytenr, u64 num_bytes,
634                                 u64 root_objectid, u64 ref_generation,
635                                 u64 owner, u64 owner_offset)
636 {
637         struct btrfs_path *path;
638         int ret;
639         struct btrfs_key key;
640         struct extent_buffer *l;
641         struct btrfs_extent_item *item;
642         u32 refs;
643
644         WARN_ON(num_bytes < root->sectorsize);
645         path = btrfs_alloc_path();
646         if (!path)
647                 return -ENOMEM;
648
649         path->reada = 1;
650         key.objectid = bytenr;
651         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
652         key.offset = num_bytes;
653         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
654                                 0, 1);
655         if (ret < 0)
656                 return ret;
657         if (ret != 0) {
658                 BUG();
659         }
660         BUG_ON(ret != 0);
661         l = path->nodes[0];
662         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
663         refs = btrfs_extent_refs(l, item);
664         btrfs_set_extent_refs(l, item, refs + 1);
665         btrfs_mark_buffer_dirty(path->nodes[0]);
666
667         btrfs_release_path(root->fs_info->extent_root, path);
668
669         path->reada = 1;
670         ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
671                                           path, bytenr, root_objectid,
672                                           ref_generation, owner, owner_offset);
673         BUG_ON(ret);
674         finish_current_insert(trans, root->fs_info->extent_root);
675         del_pending_extents(trans, root->fs_info->extent_root);
676
677         btrfs_free_path(path);
678         return 0;
679 }
680
681 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
682                          struct btrfs_root *root)
683 {
684         finish_current_insert(trans, root->fs_info->extent_root);
685         del_pending_extents(trans, root->fs_info->extent_root);
686         return 0;
687 }
688
689 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
690                              struct btrfs_root *root, u64 bytenr,
691                              u64 num_bytes, u32 *refs)
692 {
693         struct btrfs_path *path;
694         int ret;
695         struct btrfs_key key;
696         struct extent_buffer *l;
697         struct btrfs_extent_item *item;
698
699         WARN_ON(num_bytes < root->sectorsize);
700         path = btrfs_alloc_path();
701         path->reada = 1;
702         key.objectid = bytenr;
703         key.offset = num_bytes;
704         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
705         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
706                                 0, 0);
707         if (ret < 0)
708                 goto out;
709         if (ret != 0) {
710                 btrfs_print_leaf(root, path->nodes[0]);
711                 printk("failed to find block number %Lu\n", bytenr);
712                 BUG();
713         }
714         l = path->nodes[0];
715         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
716         *refs = btrfs_extent_refs(l, item);
717 out:
718         btrfs_free_path(path);
719         return 0;
720 }
721
722 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
723                                   struct btrfs_path *count_path,
724                                   u64 expected_owner,
725                                   u64 first_extent)
726 {
727         struct btrfs_root *extent_root = root->fs_info->extent_root;
728         struct btrfs_path *path;
729         u64 bytenr;
730         u64 found_objectid;
731         u64 found_owner;
732         u64 root_objectid = root->root_key.objectid;
733         u32 total_count = 0;
734         u32 extent_refs;
735         u32 cur_count;
736         u32 nritems;
737         int ret;
738         struct btrfs_key key;
739         struct btrfs_key found_key;
740         struct extent_buffer *l;
741         struct btrfs_extent_item *item;
742         struct btrfs_extent_ref *ref_item;
743         int level = -1;
744
745         path = btrfs_alloc_path();
746 again:
747         if (level == -1)
748                 bytenr = first_extent;
749         else
750                 bytenr = count_path->nodes[level]->start;
751
752         cur_count = 0;
753         key.objectid = bytenr;
754         key.offset = 0;
755
756         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
757         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
758         if (ret < 0)
759                 goto out;
760         BUG_ON(ret == 0);
761
762         l = path->nodes[0];
763         btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
764
765         if (found_key.objectid != bytenr ||
766             found_key.type != BTRFS_EXTENT_ITEM_KEY) {
767                 goto out;
768         }
769
770         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
771         extent_refs = btrfs_extent_refs(l, item);
772         while (1) {
773                 l = path->nodes[0];
774                 nritems = btrfs_header_nritems(l);
775                 if (path->slots[0] >= nritems) {
776                         ret = btrfs_next_leaf(extent_root, path);
777                         if (ret == 0)
778                                 continue;
779                         break;
780                 }
781                 btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
782                 if (found_key.objectid != bytenr)
783                         break;
784
785                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
786                         path->slots[0]++;
787                         continue;
788                 }
789
790                 cur_count++;
791                 ref_item = btrfs_item_ptr(l, path->slots[0],
792                                           struct btrfs_extent_ref);
793                 found_objectid = btrfs_ref_root(l, ref_item);
794
795                 if (found_objectid != root_objectid) {
796                         total_count = 2;
797                         goto out;
798                 }
799                 if (level == -1) {
800                         found_owner = btrfs_ref_objectid(l, ref_item);
801                         if (found_owner != expected_owner) {
802                                 total_count = 2;
803                                 goto out;
804                         }
805                         /*
806                          * nasty.  we don't count a reference held by
807                          * the running transaction.  This allows nodatacow
808                          * to avoid cow most of the time
809                          */
810                         if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
811                             btrfs_ref_generation(l, ref_item) ==
812                             root->fs_info->generation) {
813                                 extent_refs--;
814                         }
815                 }
816                 total_count = 1;
817                 path->slots[0]++;
818         }
819         /*
820          * if there is more than one reference against a data extent,
821          * we have to assume the other ref is another snapshot
822          */
823         if (level == -1 && extent_refs > 1) {
824                 total_count = 2;
825                 goto out;
826         }
827         if (cur_count == 0) {
828                 total_count = 0;
829                 goto out;
830         }
831         if (level >= 0 && root->node == count_path->nodes[level])
832                 goto out;
833         level++;
834         btrfs_release_path(root, path);
835         goto again;
836
837 out:
838         btrfs_free_path(path);
839         return total_count;
840 }
841 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
842                        struct btrfs_root *root, u64 owner_objectid)
843 {
844         u64 generation;
845         u64 key_objectid;
846         u64 level;
847         u32 nritems;
848         struct btrfs_disk_key disk_key;
849
850         level = btrfs_header_level(root->node);
851         generation = trans->transid;
852         nritems = btrfs_header_nritems(root->node);
853         if (nritems > 0) {
854                 if (level == 0)
855                         btrfs_item_key(root->node, &disk_key, 0);
856                 else
857                         btrfs_node_key(root->node, &disk_key, 0);
858                 key_objectid = btrfs_disk_key_objectid(&disk_key);
859         } else {
860                 key_objectid = 0;
861         }
862         return btrfs_inc_extent_ref(trans, root, root->node->start,
863                                     root->node->len, owner_objectid,
864                                     generation, level, key_objectid);
865 }
866
867 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
868                   struct extent_buffer *buf)
869 {
870         u64 bytenr;
871         u32 nritems;
872         struct btrfs_key key;
873         struct btrfs_file_extent_item *fi;
874         int i;
875         int level;
876         int ret;
877         int faili;
878
879         if (!root->ref_cows)
880                 return 0;
881
882         level = btrfs_header_level(buf);
883         nritems = btrfs_header_nritems(buf);
884         for (i = 0; i < nritems; i++) {
885                 if (level == 0) {
886                         u64 disk_bytenr;
887                         btrfs_item_key_to_cpu(buf, &key, i);
888                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
889                                 continue;
890                         fi = btrfs_item_ptr(buf, i,
891                                             struct btrfs_file_extent_item);
892                         if (btrfs_file_extent_type(buf, fi) ==
893                             BTRFS_FILE_EXTENT_INLINE)
894                                 continue;
895                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
896                         if (disk_bytenr == 0)
897                                 continue;
898                         ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
899                                     btrfs_file_extent_disk_num_bytes(buf, fi),
900                                     root->root_key.objectid, trans->transid,
901                                     key.objectid, key.offset);
902                         if (ret) {
903                                 faili = i;
904                                 goto fail;
905                         }
906                 } else {
907                         bytenr = btrfs_node_blockptr(buf, i);
908                         btrfs_node_key_to_cpu(buf, &key, i);
909                         ret = btrfs_inc_extent_ref(trans, root, bytenr,
910                                            btrfs_level_size(root, level - 1),
911                                            root->root_key.objectid,
912                                            trans->transid,
913                                            level - 1, key.objectid);
914                         if (ret) {
915                                 faili = i;
916                                 goto fail;
917                         }
918                 }
919         }
920         return 0;
921 fail:
922         WARN_ON(1);
923 #if 0
924         for (i =0; i < faili; i++) {
925                 if (level == 0) {
926                         u64 disk_bytenr;
927                         btrfs_item_key_to_cpu(buf, &key, i);
928                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
929                                 continue;
930                         fi = btrfs_item_ptr(buf, i,
931                                             struct btrfs_file_extent_item);
932                         if (btrfs_file_extent_type(buf, fi) ==
933                             BTRFS_FILE_EXTENT_INLINE)
934                                 continue;
935                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
936                         if (disk_bytenr == 0)
937                                 continue;
938                         err = btrfs_free_extent(trans, root, disk_bytenr,
939                                     btrfs_file_extent_disk_num_bytes(buf,
940                                                                       fi), 0);
941                         BUG_ON(err);
942                 } else {
943                         bytenr = btrfs_node_blockptr(buf, i);
944                         err = btrfs_free_extent(trans, root, bytenr,
945                                         btrfs_level_size(root, level - 1), 0);
946                         BUG_ON(err);
947                 }
948         }
949 #endif
950         return ret;
951 }
952
953 static int write_one_cache_group(struct btrfs_trans_handle *trans,
954                                  struct btrfs_root *root,
955                                  struct btrfs_path *path,
956                                  struct btrfs_block_group_cache *cache)
957 {
958         int ret;
959         int pending_ret;
960         struct btrfs_root *extent_root = root->fs_info->extent_root;
961         unsigned long bi;
962         struct extent_buffer *leaf;
963
964         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
965         if (ret < 0)
966                 goto fail;
967         BUG_ON(ret);
968
969         leaf = path->nodes[0];
970         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
971         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
972         btrfs_mark_buffer_dirty(leaf);
973         btrfs_release_path(extent_root, path);
974 fail:
975         finish_current_insert(trans, extent_root);
976         pending_ret = del_pending_extents(trans, extent_root);
977         if (ret)
978                 return ret;
979         if (pending_ret)
980                 return pending_ret;
981         return 0;
982
983 }
984
985 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
986                                    struct btrfs_root *root)
987 {
988         struct extent_io_tree *block_group_cache;
989         struct btrfs_block_group_cache *cache;
990         int ret;
991         int err = 0;
992         int werr = 0;
993         struct btrfs_path *path;
994         u64 last = 0;
995         u64 start;
996         u64 end;
997         u64 ptr;
998
999         block_group_cache = &root->fs_info->block_group_cache;
1000         path = btrfs_alloc_path();
1001         if (!path)
1002                 return -ENOMEM;
1003
1004         while(1) {
1005                 ret = find_first_extent_bit(block_group_cache, last,
1006                                             &start, &end, BLOCK_GROUP_DIRTY);
1007                 if (ret)
1008                         break;
1009
1010                 last = end + 1;
1011                 ret = get_state_private(block_group_cache, start, &ptr);
1012                 if (ret)
1013                         break;
1014                 cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
1015                 err = write_one_cache_group(trans, root,
1016                                             path, cache);
1017                 /*
1018                  * if we fail to write the cache group, we want
1019                  * to keep it marked dirty in hopes that a later
1020                  * write will work
1021                  */
1022                 if (err) {
1023                         werr = err;
1024                         continue;
1025                 }
1026                 clear_extent_bits(block_group_cache, start, end,
1027                                   BLOCK_GROUP_DIRTY, GFP_NOFS);
1028         }
1029         btrfs_free_path(path);
1030         return werr;
1031 }
1032
1033 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
1034                                                   u64 flags)
1035 {
1036         struct list_head *head = &info->space_info;
1037         struct list_head *cur;
1038         struct btrfs_space_info *found;
1039         list_for_each(cur, head) {
1040                 found = list_entry(cur, struct btrfs_space_info, list);
1041                 if (found->flags == flags)
1042                         return found;
1043         }
1044         return NULL;
1045
1046 }
1047
1048 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1049                              u64 total_bytes, u64 bytes_used,
1050                              struct btrfs_space_info **space_info)
1051 {
1052         struct btrfs_space_info *found;
1053
1054         found = __find_space_info(info, flags);
1055         if (found) {
1056                 found->total_bytes += total_bytes;
1057                 found->bytes_used += bytes_used;
1058                 found->full = 0;
1059                 WARN_ON(found->total_bytes < found->bytes_used);
1060                 *space_info = found;
1061                 return 0;
1062         }
1063         found = kmalloc(sizeof(*found), GFP_NOFS);
1064         if (!found)
1065                 return -ENOMEM;
1066
1067         list_add(&found->list, &info->space_info);
1068         found->flags = flags;
1069         found->total_bytes = total_bytes;
1070         found->bytes_used = bytes_used;
1071         found->bytes_pinned = 0;
1072         found->full = 0;
1073         *space_info = found;
1074         return 0;
1075 }
1076
1077 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1078 {
1079         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1080                                    BTRFS_BLOCK_GROUP_RAID1 |
1081                                    BTRFS_BLOCK_GROUP_RAID10 |
1082                                    BTRFS_BLOCK_GROUP_DUP);
1083         if (extra_flags) {
1084                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1085                         fs_info->avail_data_alloc_bits |= extra_flags;
1086                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1087                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1088                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1089                         fs_info->avail_system_alloc_bits |= extra_flags;
1090         }
1091 }
1092
1093 static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1094 {
1095         u64 num_devices = root->fs_info->fs_devices->num_devices;
1096
1097         if (num_devices == 1)
1098                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1099         if (num_devices < 4)
1100                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1101
1102         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1103             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1104                       BTRFS_BLOCK_GROUP_RAID10))) {
1105                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1106         }
1107
1108         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1109             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1110                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1111         }
1112
1113         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1114             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1115              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1116              (flags & BTRFS_BLOCK_GROUP_DUP)))
1117                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1118         return flags;
1119 }
1120
1121 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1122                           struct btrfs_root *extent_root, u64 alloc_bytes,
1123                           u64 flags)
1124 {
1125         struct btrfs_space_info *space_info;
1126         u64 thresh;
1127         u64 start;
1128         u64 num_bytes;
1129         int ret;
1130
1131         flags = reduce_alloc_profile(extent_root, flags);
1132
1133         space_info = __find_space_info(extent_root->fs_info, flags);
1134         if (!space_info) {
1135                 ret = update_space_info(extent_root->fs_info, flags,
1136                                         0, 0, &space_info);
1137                 BUG_ON(ret);
1138         }
1139         BUG_ON(!space_info);
1140
1141         if (space_info->full)
1142                 return 0;
1143
1144         thresh = div_factor(space_info->total_bytes, 6);
1145         if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
1146             thresh)
1147                 return 0;
1148
1149         ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
1150         if (ret == -ENOSPC) {
1151 printk("space info full %Lu\n", flags);
1152                 space_info->full = 1;
1153                 return 0;
1154         }
1155
1156         BUG_ON(ret);
1157
1158         ret = btrfs_make_block_group(trans, extent_root, 0, flags,
1159                      BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
1160         BUG_ON(ret);
1161
1162         return 0;
1163 }
1164
1165 static int update_block_group(struct btrfs_trans_handle *trans,
1166                               struct btrfs_root *root,
1167                               u64 bytenr, u64 num_bytes, int alloc,
1168                               int mark_free)
1169 {
1170         struct btrfs_block_group_cache *cache;
1171         struct btrfs_fs_info *info = root->fs_info;
1172         u64 total = num_bytes;
1173         u64 old_val;
1174         u64 byte_in_group;
1175         u64 start;
1176         u64 end;
1177
1178         while(total) {
1179                 cache = btrfs_lookup_block_group(info, bytenr);
1180                 if (!cache) {
1181                         return -1;
1182                 }
1183                 byte_in_group = bytenr - cache->key.objectid;
1184                 WARN_ON(byte_in_group > cache->key.offset);
1185                 start = cache->key.objectid;
1186                 end = start + cache->key.offset - 1;
1187                 set_extent_bits(&info->block_group_cache, start, end,
1188                                 BLOCK_GROUP_DIRTY, GFP_NOFS);
1189
1190                 old_val = btrfs_block_group_used(&cache->item);
1191                 num_bytes = min(total, cache->key.offset - byte_in_group);
1192                 if (alloc) {
1193                         old_val += num_bytes;
1194                         cache->space_info->bytes_used += num_bytes;
1195                 } else {
1196                         old_val -= num_bytes;
1197                         cache->space_info->bytes_used -= num_bytes;
1198                         if (mark_free) {
1199                                 set_extent_dirty(&info->free_space_cache,
1200                                                  bytenr, bytenr + num_bytes - 1,
1201                                                  GFP_NOFS);
1202                         }
1203                 }
1204                 btrfs_set_block_group_used(&cache->item, old_val);
1205                 total -= num_bytes;
1206                 bytenr += num_bytes;
1207         }
1208         return 0;
1209 }
1210
1211 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1212 {
1213         u64 start;
1214         u64 end;
1215         int ret;
1216         ret = find_first_extent_bit(&root->fs_info->block_group_cache,
1217                                     search_start, &start, &end,
1218                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
1219                                     BLOCK_GROUP_SYSTEM);
1220         if (ret)
1221                 return 0;
1222         return start;
1223 }
1224
1225
1226 static int update_pinned_extents(struct btrfs_root *root,
1227                                 u64 bytenr, u64 num, int pin)
1228 {
1229         u64 len;
1230         struct btrfs_block_group_cache *cache;
1231         struct btrfs_fs_info *fs_info = root->fs_info;
1232
1233         if (pin) {
1234                 set_extent_dirty(&fs_info->pinned_extents,
1235                                 bytenr, bytenr + num - 1, GFP_NOFS);
1236         } else {
1237                 clear_extent_dirty(&fs_info->pinned_extents,
1238                                 bytenr, bytenr + num - 1, GFP_NOFS);
1239         }
1240         while (num > 0) {
1241                 cache = btrfs_lookup_block_group(fs_info, bytenr);
1242                 if (!cache) {
1243                         u64 first = first_logical_byte(root, bytenr);
1244                         WARN_ON(first < bytenr);
1245                         len = min(first - bytenr, num);
1246                 } else {
1247                         len = min(num, cache->key.offset -
1248                                   (bytenr - cache->key.objectid));
1249                 }
1250                 if (pin) {
1251                         if (cache) {
1252                                 cache->pinned += len;
1253                                 cache->space_info->bytes_pinned += len;
1254                         }
1255                         fs_info->total_pinned += len;
1256                 } else {
1257                         if (cache) {
1258                                 cache->pinned -= len;
1259                                 cache->space_info->bytes_pinned -= len;
1260                         }
1261                         fs_info->total_pinned -= len;
1262                 }
1263                 bytenr += len;
1264                 num -= len;
1265         }
1266         return 0;
1267 }
1268
1269 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1270 {
1271         u64 last = 0;
1272         u64 start;
1273         u64 end;
1274         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1275         int ret;
1276
1277         while(1) {
1278                 ret = find_first_extent_bit(pinned_extents, last,
1279                                             &start, &end, EXTENT_DIRTY);
1280                 if (ret)
1281                         break;
1282                 set_extent_dirty(copy, start, end, GFP_NOFS);
1283                 last = end + 1;
1284         }
1285         return 0;
1286 }
1287
1288 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1289                                struct btrfs_root *root,
1290                                struct extent_io_tree *unpin)
1291 {
1292         u64 start;
1293         u64 end;
1294         int ret;
1295         struct extent_io_tree *free_space_cache;
1296         free_space_cache = &root->fs_info->free_space_cache;
1297
1298         while(1) {
1299                 ret = find_first_extent_bit(unpin, 0, &start, &end,
1300                                             EXTENT_DIRTY);
1301                 if (ret)
1302                         break;
1303                 update_pinned_extents(root, start, end + 1 - start, 0);
1304                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1305                 set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
1306         }
1307         return 0;
1308 }
1309
1310 static int finish_current_insert(struct btrfs_trans_handle *trans,
1311                                  struct btrfs_root *extent_root)
1312 {
1313         u64 start;
1314         u64 end;
1315         struct btrfs_fs_info *info = extent_root->fs_info;
1316         struct extent_buffer *eb;
1317         struct btrfs_path *path;
1318         struct btrfs_key ins;
1319         struct btrfs_disk_key first;
1320         struct btrfs_extent_item extent_item;
1321         int ret;
1322         int level;
1323         int err = 0;
1324
1325         btrfs_set_stack_extent_refs(&extent_item, 1);
1326         btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
1327         path = btrfs_alloc_path();
1328
1329         while(1) {
1330                 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1331                                             &end, EXTENT_LOCKED);
1332                 if (ret)
1333                         break;
1334
1335                 ins.objectid = start;
1336                 ins.offset = end + 1 - start;
1337                 err = btrfs_insert_item(trans, extent_root, &ins,
1338                                         &extent_item, sizeof(extent_item));
1339                 clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
1340                                   GFP_NOFS);
1341                 eb = read_tree_block(extent_root, ins.objectid, ins.offset);
1342                 level = btrfs_header_level(eb);
1343                 if (level == 0) {
1344                         btrfs_item_key(eb, &first, 0);
1345                 } else {
1346                         btrfs_node_key(eb, &first, 0);
1347                 }
1348                 err = btrfs_insert_extent_backref(trans, extent_root, path,
1349                                           start, extent_root->root_key.objectid,
1350                                           0, level,
1351                                           btrfs_disk_key_objectid(&first));
1352                 BUG_ON(err);
1353                 free_extent_buffer(eb);
1354         }
1355         btrfs_free_path(path);
1356         return 0;
1357 }
1358
1359 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
1360                           int pending)
1361 {
1362         int err = 0;
1363         struct extent_buffer *buf;
1364
1365         if (!pending) {
1366                 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1367                 if (buf) {
1368                         if (btrfs_buffer_uptodate(buf)) {
1369                                 u64 transid =
1370                                     root->fs_info->running_transaction->transid;
1371                                 u64 header_transid =
1372                                         btrfs_header_generation(buf);
1373                                 if (header_transid == transid &&
1374                                     !btrfs_header_flag(buf,
1375                                                BTRFS_HEADER_FLAG_WRITTEN)) {
1376                                         clean_tree_block(NULL, root, buf);
1377                                         free_extent_buffer(buf);
1378                                         return 1;
1379                                 }
1380                         }
1381                         free_extent_buffer(buf);
1382                 }
1383                 update_pinned_extents(root, bytenr, num_bytes, 1);
1384         } else {
1385                 set_extent_bits(&root->fs_info->pending_del,
1386                                 bytenr, bytenr + num_bytes - 1,
1387                                 EXTENT_LOCKED, GFP_NOFS);
1388         }
1389         BUG_ON(err < 0);
1390         return 0;
1391 }
1392
1393 /*
1394  * remove an extent from the root, returns 0 on success
1395  */
1396 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1397                          *root, u64 bytenr, u64 num_bytes,
1398                          u64 root_objectid, u64 ref_generation,
1399                          u64 owner_objectid, u64 owner_offset, int pin,
1400                          int mark_free)
1401 {
1402         struct btrfs_path *path;
1403         struct btrfs_key key;
1404         struct btrfs_fs_info *info = root->fs_info;
1405         struct btrfs_root *extent_root = info->extent_root;
1406         struct extent_buffer *leaf;
1407         int ret;
1408         int extent_slot = 0;
1409         int found_extent = 0;
1410         int num_to_del = 1;
1411         struct btrfs_extent_item *ei;
1412         u32 refs;
1413
1414         key.objectid = bytenr;
1415         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1416         key.offset = num_bytes;
1417         path = btrfs_alloc_path();
1418         if (!path)
1419                 return -ENOMEM;
1420
1421         path->reada = 1;
1422         ret = lookup_extent_backref(trans, extent_root, path,
1423                                     bytenr, root_objectid,
1424                                     ref_generation,
1425                                     owner_objectid, owner_offset, 1);
1426         if (ret == 0) {
1427                 struct btrfs_key found_key;
1428                 extent_slot = path->slots[0];
1429                 while(extent_slot > 0) {
1430                         extent_slot--;
1431                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1432                                               extent_slot);
1433                         if (found_key.objectid != bytenr)
1434                                 break;
1435                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1436                             found_key.offset == num_bytes) {
1437                                 found_extent = 1;
1438                                 break;
1439                         }
1440                         if (path->slots[0] - extent_slot > 5)
1441                                 break;
1442                 }
1443                 if (!found_extent)
1444                         ret = btrfs_del_item(trans, extent_root, path);
1445         } else {
1446                 btrfs_print_leaf(extent_root, path->nodes[0]);
1447                 WARN_ON(1);
1448                 printk("Unable to find ref byte nr %Lu root %Lu "
1449                        " gen %Lu owner %Lu offset %Lu\n", bytenr,
1450                        root_objectid, ref_generation, owner_objectid,
1451                        owner_offset);
1452         }
1453         if (!found_extent) {
1454                 btrfs_release_path(extent_root, path);
1455                 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1456                 if (ret < 0)
1457                         return ret;
1458                 BUG_ON(ret);
1459                 extent_slot = path->slots[0];
1460         }
1461
1462         leaf = path->nodes[0];
1463         ei = btrfs_item_ptr(leaf, extent_slot,
1464                             struct btrfs_extent_item);
1465         refs = btrfs_extent_refs(leaf, ei);
1466         BUG_ON(refs == 0);
1467         refs -= 1;
1468         btrfs_set_extent_refs(leaf, ei, refs);
1469
1470         btrfs_mark_buffer_dirty(leaf);
1471
1472         if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1473                 /* if the back ref and the extent are next to each other
1474                  * they get deleted below in one shot
1475                  */
1476                 path->slots[0] = extent_slot;
1477                 num_to_del = 2;
1478         } else if (found_extent) {
1479                 /* otherwise delete the extent back ref */
1480                 ret = btrfs_del_item(trans, extent_root, path);
1481                 BUG_ON(ret);
1482                 /* if refs are 0, we need to setup the path for deletion */
1483                 if (refs == 0) {
1484                         btrfs_release_path(extent_root, path);
1485                         ret = btrfs_search_slot(trans, extent_root, &key, path,
1486                                                 -1, 1);
1487                         if (ret < 0)
1488                                 return ret;
1489                         BUG_ON(ret);
1490                 }
1491         }
1492
1493         if (refs == 0) {
1494                 u64 super_used;
1495                 u64 root_used;
1496
1497                 if (pin) {
1498                         ret = pin_down_bytes(root, bytenr, num_bytes, 0);
1499                         if (ret > 0)
1500                                 mark_free = 1;
1501                         BUG_ON(ret < 0);
1502                 }
1503
1504                 /* block accounting for super block */
1505                 super_used = btrfs_super_bytes_used(&info->super_copy);
1506                 btrfs_set_super_bytes_used(&info->super_copy,
1507                                            super_used - num_bytes);
1508
1509                 /* block accounting for root item */
1510                 root_used = btrfs_root_used(&root->root_item);
1511                 btrfs_set_root_used(&root->root_item,
1512                                            root_used - num_bytes);
1513                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
1514                                       num_to_del);
1515                 if (ret) {
1516                         return ret;
1517                 }
1518                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1519                                          mark_free);
1520                 BUG_ON(ret);
1521         }
1522         btrfs_free_path(path);
1523         finish_current_insert(trans, extent_root);
1524         return ret;
1525 }
1526
1527 /*
1528  * find all the blocks marked as pending in the radix tree and remove
1529  * them from the extent map
1530  */
1531 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
1532                                btrfs_root *extent_root)
1533 {
1534         int ret;
1535         int err = 0;
1536         u64 start;
1537         u64 end;
1538         struct extent_io_tree *pending_del;
1539         struct extent_io_tree *pinned_extents;
1540
1541         pending_del = &extent_root->fs_info->pending_del;
1542         pinned_extents = &extent_root->fs_info->pinned_extents;
1543
1544         while(1) {
1545                 ret = find_first_extent_bit(pending_del, 0, &start, &end,
1546                                             EXTENT_LOCKED);
1547                 if (ret)
1548                         break;
1549                 update_pinned_extents(extent_root, start, end + 1 - start, 1);
1550                 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
1551                                   GFP_NOFS);
1552                 ret = __free_extent(trans, extent_root,
1553                                      start, end + 1 - start,
1554                                      extent_root->root_key.objectid,
1555                                      0, 0, 0, 0, 0);
1556                 if (ret)
1557                         err = ret;
1558         }
1559         return err;
1560 }
1561
1562 /*
1563  * remove an extent from the root, returns 0 on success
1564  */
1565 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1566                       *root, u64 bytenr, u64 num_bytes,
1567                       u64 root_objectid, u64 ref_generation,
1568                       u64 owner_objectid, u64 owner_offset, int pin)
1569 {
1570         struct btrfs_root *extent_root = root->fs_info->extent_root;
1571         int pending_ret;
1572         int ret;
1573
1574         WARN_ON(num_bytes < root->sectorsize);
1575         if (!root->ref_cows)
1576                 ref_generation = 0;
1577
1578         if (root == extent_root) {
1579                 pin_down_bytes(root, bytenr, num_bytes, 1);
1580                 return 0;
1581         }
1582         ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
1583                             ref_generation, owner_objectid, owner_offset,
1584                             pin, pin == 0);
1585         pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
1586         return ret ? ret : pending_ret;
1587 }
1588
1589 static u64 stripe_align(struct btrfs_root *root, u64 val)
1590 {
1591         u64 mask = ((u64)root->stripesize - 1);
1592         u64 ret = (val + mask) & ~mask;
1593         return ret;
1594 }
1595
1596 /*
1597  * walks the btree of allocated extents and find a hole of a given size.
1598  * The key ins is changed to record the hole:
1599  * ins->objectid == block start
1600  * ins->flags = BTRFS_EXTENT_ITEM_KEY
1601  * ins->offset == number of blocks
1602  * Any available blocks before search_start are skipped.
1603  */
1604 static int noinline find_free_extent(struct btrfs_trans_handle *trans,
1605                                      struct btrfs_root *orig_root,
1606                                      u64 num_bytes, u64 empty_size,
1607                                      u64 search_start, u64 search_end,
1608                                      u64 hint_byte, struct btrfs_key *ins,
1609                                      u64 exclude_start, u64 exclude_nr,
1610                                      int data)
1611 {
1612         int ret;
1613         u64 orig_search_start;
1614         struct btrfs_root * root = orig_root->fs_info->extent_root;
1615         struct btrfs_fs_info *info = root->fs_info;
1616         u64 total_needed = num_bytes;
1617         u64 *last_ptr = NULL;
1618         struct btrfs_block_group_cache *block_group;
1619         int full_scan = 0;
1620         int wrapped = 0;
1621         int empty_cluster = 2 * 1024 * 1024;
1622
1623         WARN_ON(num_bytes < root->sectorsize);
1624         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
1625
1626         if (data & BTRFS_BLOCK_GROUP_METADATA) {
1627                 last_ptr = &root->fs_info->last_alloc;
1628                 empty_cluster = 256 * 1024;
1629         }
1630
1631         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
1632                 last_ptr = &root->fs_info->last_data_alloc;
1633         }
1634
1635         if (last_ptr) {
1636                 if (*last_ptr)
1637                         hint_byte = *last_ptr;
1638                 else {
1639                         empty_size += empty_cluster;
1640                 }
1641         }
1642
1643         search_start = max(search_start, first_logical_byte(root, 0));
1644         orig_search_start = search_start;
1645
1646         if (search_end == (u64)-1)
1647                 search_end = btrfs_super_total_bytes(&info->super_copy);
1648
1649         if (hint_byte) {
1650                 block_group = btrfs_lookup_block_group(info, hint_byte);
1651                 if (!block_group)
1652                         hint_byte = search_start;
1653                 block_group = btrfs_find_block_group(root, block_group,
1654                                                      hint_byte, data, 1);
1655                 if (last_ptr && *last_ptr == 0 && block_group)
1656                         hint_byte = block_group->key.objectid;
1657         } else {
1658                 block_group = btrfs_find_block_group(root,
1659                                                      trans->block_group,
1660                                                      search_start, data, 1);
1661         }
1662         search_start = max(search_start, hint_byte);
1663
1664         total_needed += empty_size;
1665
1666 check_failed:
1667         if (!block_group) {
1668                 block_group = btrfs_lookup_block_group(info, search_start);
1669                 if (!block_group)
1670                         block_group = btrfs_lookup_block_group(info,
1671                                                        orig_search_start);
1672         }
1673         ret = find_search_start(root, &block_group, &search_start,
1674                                 total_needed, data);
1675         if (ret == -ENOSPC && last_ptr && *last_ptr) {
1676                 *last_ptr = 0;
1677                 block_group = btrfs_lookup_block_group(info,
1678                                                        orig_search_start);
1679                 search_start = orig_search_start;
1680                 ret = find_search_start(root, &block_group, &search_start,
1681                                         total_needed, data);
1682         }
1683         if (ret == -ENOSPC)
1684                 goto enospc;
1685         if (ret)
1686                 goto error;
1687
1688         if (last_ptr && *last_ptr && search_start != *last_ptr) {
1689                 *last_ptr = 0;
1690                 if (!empty_size) {
1691                         empty_size += empty_cluster;
1692                         total_needed += empty_size;
1693                 }
1694                 block_group = btrfs_lookup_block_group(info,
1695                                                        orig_search_start);
1696                 search_start = orig_search_start;
1697                 ret = find_search_start(root, &block_group,
1698                                         &search_start, total_needed, data);
1699                 if (ret == -ENOSPC)
1700                         goto enospc;
1701                 if (ret)
1702                         goto error;
1703         }
1704
1705         search_start = stripe_align(root, search_start);
1706         ins->objectid = search_start;
1707         ins->offset = num_bytes;
1708
1709         if (ins->objectid + num_bytes >= search_end)
1710                 goto enospc;
1711
1712         if (ins->objectid + num_bytes >
1713             block_group->key.objectid + block_group->key.offset) {
1714                 search_start = block_group->key.objectid +
1715                         block_group->key.offset;
1716                 goto new_group;
1717         }
1718
1719         if (test_range_bit(&info->extent_ins, ins->objectid,
1720                            ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
1721                 search_start = ins->objectid + num_bytes;
1722                 goto new_group;
1723         }
1724
1725         if (test_range_bit(&info->pinned_extents, ins->objectid,
1726                            ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
1727                 search_start = ins->objectid + num_bytes;
1728                 goto new_group;
1729         }
1730
1731         if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
1732             ins->objectid < exclude_start + exclude_nr)) {
1733                 search_start = exclude_start + exclude_nr;
1734                 goto new_group;
1735         }
1736
1737         if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
1738                 block_group = btrfs_lookup_block_group(info, ins->objectid);
1739                 if (block_group)
1740                         trans->block_group = block_group;
1741         }
1742         ins->offset = num_bytes;
1743         if (last_ptr) {
1744                 *last_ptr = ins->objectid + ins->offset;
1745                 if (*last_ptr ==
1746                     btrfs_super_total_bytes(&root->fs_info->super_copy)) {
1747                         *last_ptr = 0;
1748                 }
1749         }
1750         return 0;
1751
1752 new_group:
1753         if (search_start + num_bytes >= search_end) {
1754 enospc:
1755                 search_start = orig_search_start;
1756                 if (full_scan) {
1757                         ret = -ENOSPC;
1758                         goto error;
1759                 }
1760                 if (wrapped) {
1761                         if (!full_scan)
1762                                 total_needed -= empty_size;
1763                         full_scan = 1;
1764                 } else
1765                         wrapped = 1;
1766         }
1767         block_group = btrfs_lookup_block_group(info, search_start);
1768         cond_resched();
1769         block_group = btrfs_find_block_group(root, block_group,
1770                                              search_start, data, 0);
1771         goto check_failed;
1772
1773 error:
1774         return ret;
1775 }
1776
1777 /*
1778  * finds a free extent and does all the dirty work required for allocation
1779  * returns the key for the extent through ins, and a tree buffer for
1780  * the first block of the extent through buf.
1781  *
1782  * returns 0 if everything worked, non-zero otherwise.
1783  */
1784 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1785                        struct btrfs_root *root,
1786                        u64 num_bytes, u64 min_alloc_size,
1787                        u64 root_objectid, u64 ref_generation,
1788                        u64 owner, u64 owner_offset,
1789                        u64 empty_size, u64 hint_byte,
1790                        u64 search_end, struct btrfs_key *ins, u64 data)
1791 {
1792         int ret;
1793         int pending_ret;
1794         u64 super_used;
1795         u64 root_used;
1796         u64 search_start = 0;
1797         u64 alloc_profile;
1798         u32 sizes[2];
1799         struct btrfs_fs_info *info = root->fs_info;
1800         struct btrfs_root *extent_root = info->extent_root;
1801         struct btrfs_extent_item *extent_item;
1802         struct btrfs_extent_ref *ref;
1803         struct btrfs_path *path;
1804         struct btrfs_key keys[2];
1805
1806         if (data) {
1807                 alloc_profile = info->avail_data_alloc_bits &
1808                                 info->data_alloc_profile;
1809                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1810         } else if (root == root->fs_info->chunk_root) {
1811                 alloc_profile = info->avail_system_alloc_bits &
1812                                 info->system_alloc_profile;
1813                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1814         } else {
1815                 alloc_profile = info->avail_metadata_alloc_bits &
1816                                 info->metadata_alloc_profile;
1817                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1818         }
1819 again:
1820         data = reduce_alloc_profile(root, data);
1821         if (root->ref_cows) {
1822                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
1823                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1824                                              2 * 1024 * 1024,
1825                                              BTRFS_BLOCK_GROUP_METADATA |
1826                                              (info->metadata_alloc_profile &
1827                                               info->avail_metadata_alloc_bits));
1828                         BUG_ON(ret);
1829                 }
1830                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1831                                      num_bytes + 2 * 1024 * 1024, data);
1832                 BUG_ON(ret);
1833         }
1834
1835         WARN_ON(num_bytes < root->sectorsize);
1836         ret = find_free_extent(trans, root, num_bytes, empty_size,
1837                                search_start, search_end, hint_byte, ins,
1838                                trans->alloc_exclude_start,
1839                                trans->alloc_exclude_nr, data);
1840
1841         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
1842                 num_bytes = num_bytes >> 1;
1843                 num_bytes = max(num_bytes, min_alloc_size);
1844                 goto again;
1845         }
1846         if (ret) {
1847                 printk("allocation failed flags %Lu\n", data);
1848         }
1849         BUG_ON(ret);
1850         if (ret)
1851                 return ret;
1852
1853         /* block accounting for super block */
1854         super_used = btrfs_super_bytes_used(&info->super_copy);
1855         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
1856
1857         /* block accounting for root item */
1858         root_used = btrfs_root_used(&root->root_item);
1859         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
1860
1861         clear_extent_dirty(&root->fs_info->free_space_cache,
1862                            ins->objectid, ins->objectid + ins->offset - 1,
1863                            GFP_NOFS);
1864
1865         if (root == extent_root) {
1866                 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
1867                                 ins->objectid + ins->offset - 1,
1868                                 EXTENT_LOCKED, GFP_NOFS);
1869                 goto update_block;
1870         }
1871
1872         WARN_ON(trans->alloc_exclude_nr);
1873         trans->alloc_exclude_start = ins->objectid;
1874         trans->alloc_exclude_nr = ins->offset;
1875
1876         memcpy(&keys[0], ins, sizeof(*ins));
1877         keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
1878                                          owner, owner_offset);
1879         keys[1].objectid = ins->objectid;
1880         keys[1].type = BTRFS_EXTENT_REF_KEY;
1881         sizes[0] = sizeof(*extent_item);
1882         sizes[1] = sizeof(*ref);
1883
1884         path = btrfs_alloc_path();
1885         BUG_ON(!path);
1886
1887         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
1888                                        sizes, 2);
1889
1890         BUG_ON(ret);
1891         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1892                                      struct btrfs_extent_item);
1893         btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
1894         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
1895                              struct btrfs_extent_ref);
1896
1897         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
1898         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
1899         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
1900         btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
1901
1902         btrfs_mark_buffer_dirty(path->nodes[0]);
1903
1904         trans->alloc_exclude_start = 0;
1905         trans->alloc_exclude_nr = 0;
1906         btrfs_free_path(path);
1907         finish_current_insert(trans, extent_root);
1908         pending_ret = del_pending_extents(trans, extent_root);
1909
1910         if (ret) {
1911                 return ret;
1912         }
1913         if (pending_ret) {
1914                 return pending_ret;
1915         }
1916
1917 update_block:
1918         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
1919         if (ret) {
1920                 printk("update block group failed for %Lu %Lu\n",
1921                        ins->objectid, ins->offset);
1922                 BUG();
1923         }
1924         return 0;
1925 }
1926
1927 /*
1928  * helper function to allocate a block for a given tree
1929  * returns the tree buffer or NULL.
1930  */
1931 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1932                                              struct btrfs_root *root,
1933                                              u32 blocksize,
1934                                              u64 root_objectid, u64 hint,
1935                                              u64 empty_size)
1936 {
1937         u64 ref_generation;
1938
1939         if (root->ref_cows)
1940                 ref_generation = trans->transid;
1941         else
1942                 ref_generation = 0;
1943
1944
1945         return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
1946                                         ref_generation, 0, 0, hint, empty_size);
1947 }
1948
1949 /*
1950  * helper function to allocate a block for a given tree
1951  * returns the tree buffer or NULL.
1952  */
1953 struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1954                                              struct btrfs_root *root,
1955                                              u32 blocksize,
1956                                              u64 root_objectid,
1957                                              u64 ref_generation,
1958                                              u64 first_objectid,
1959                                              int level,
1960                                              u64 hint,
1961                                              u64 empty_size)
1962 {
1963         struct btrfs_key ins;
1964         int ret;
1965         struct extent_buffer *buf;
1966
1967         ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
1968                                  root_objectid, ref_generation,
1969                                  level, first_objectid, empty_size, hint,
1970                                  (u64)-1, &ins, 0);
1971         if (ret) {
1972                 BUG_ON(ret > 0);
1973                 return ERR_PTR(ret);
1974         }
1975         buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
1976         if (!buf) {
1977                 btrfs_free_extent(trans, root, ins.objectid, blocksize,
1978                                   root->root_key.objectid, ref_generation,
1979                                   0, 0, 0);
1980                 return ERR_PTR(-ENOMEM);
1981         }
1982         btrfs_set_header_generation(buf, trans->transid);
1983         clean_tree_block(trans, root, buf);
1984         btrfs_set_buffer_uptodate(buf);
1985
1986         if (PageDirty(buf->first_page)) {
1987                 printk("page %lu dirty\n", buf->first_page->index);
1988                 WARN_ON(1);
1989         }
1990
1991         set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
1992                          buf->start + buf->len - 1, GFP_NOFS);
1993         if (!btrfs_test_opt(root, SSD))
1994                 btrfs_set_buffer_defrag(buf);
1995         trans->blocks_used++;
1996         return buf;
1997 }
1998
1999 static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
2000                                   struct btrfs_root *root,
2001                                   struct extent_buffer *leaf)
2002 {
2003         u64 leaf_owner;
2004         u64 leaf_generation;
2005         struct btrfs_key key;
2006         struct btrfs_file_extent_item *fi;
2007         int i;
2008         int nritems;
2009         int ret;
2010
2011         BUG_ON(!btrfs_is_leaf(leaf));
2012         nritems = btrfs_header_nritems(leaf);
2013         leaf_owner = btrfs_header_owner(leaf);
2014         leaf_generation = btrfs_header_generation(leaf);
2015
2016         for (i = 0; i < nritems; i++) {
2017                 u64 disk_bytenr;
2018
2019                 btrfs_item_key_to_cpu(leaf, &key, i);
2020                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2021                         continue;
2022                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2023                 if (btrfs_file_extent_type(leaf, fi) ==
2024                     BTRFS_FILE_EXTENT_INLINE)
2025                         continue;
2026                 /*
2027                  * FIXME make sure to insert a trans record that
2028                  * repeats the snapshot del on crash
2029                  */
2030                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2031                 if (disk_bytenr == 0)
2032                         continue;
2033                 ret = btrfs_free_extent(trans, root, disk_bytenr,
2034                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
2035                                 leaf_owner, leaf_generation,
2036                                 key.objectid, key.offset, 0);
2037                 BUG_ON(ret);
2038         }
2039         return 0;
2040 }
2041
2042 static void noinline reada_walk_down(struct btrfs_root *root,
2043                                      struct extent_buffer *node,
2044                                      int slot)
2045 {
2046         u64 bytenr;
2047         u64 last = 0;
2048         u32 nritems;
2049         u32 refs;
2050         u32 blocksize;
2051         int ret;
2052         int i;
2053         int level;
2054         int skipped = 0;
2055
2056         nritems = btrfs_header_nritems(node);
2057         level = btrfs_header_level(node);
2058         if (level)
2059                 return;
2060
2061         for (i = slot; i < nritems && skipped < 32; i++) {
2062                 bytenr = btrfs_node_blockptr(node, i);
2063                 if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
2064                              (last > bytenr && last - bytenr > 32 * 1024))) {
2065                         skipped++;
2066                         continue;
2067                 }
2068                 blocksize = btrfs_level_size(root, level - 1);
2069                 if (i != slot) {
2070                         ret = lookup_extent_ref(NULL, root, bytenr,
2071                                                 blocksize, &refs);
2072                         BUG_ON(ret);
2073                         if (refs != 1) {
2074                                 skipped++;
2075                                 continue;
2076                         }
2077                 }
2078                 mutex_unlock(&root->fs_info->fs_mutex);
2079                 ret = readahead_tree_block(root, bytenr, blocksize);
2080                 last = bytenr + blocksize;
2081                 cond_resched();
2082                 mutex_lock(&root->fs_info->fs_mutex);
2083                 if (ret)
2084                         break;
2085         }
2086 }
2087
2088 /*
2089  * helper function for drop_snapshot, this walks down the tree dropping ref
2090  * counts as it goes.
2091  */
2092 static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
2093                                    struct btrfs_root *root,
2094                                    struct btrfs_path *path, int *level)
2095 {
2096         u64 root_owner;
2097         u64 root_gen;
2098         u64 bytenr;
2099         struct extent_buffer *next;
2100         struct extent_buffer *cur;
2101         struct extent_buffer *parent;
2102         u32 blocksize;
2103         int ret;
2104         u32 refs;
2105
2106         WARN_ON(*level < 0);
2107         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2108         ret = lookup_extent_ref(trans, root,
2109                                 path->nodes[*level]->start,
2110                                 path->nodes[*level]->len, &refs);
2111         BUG_ON(ret);
2112         if (refs > 1)
2113                 goto out;
2114
2115         /*
2116          * walk down to the last node level and free all the leaves
2117          */
2118         while(*level >= 0) {
2119                 WARN_ON(*level < 0);
2120                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2121                 cur = path->nodes[*level];
2122
2123                 if (btrfs_header_level(cur) != *level)
2124                         WARN_ON(1);
2125
2126                 if (path->slots[*level] >=
2127                     btrfs_header_nritems(cur))
2128                         break;
2129                 if (*level == 0) {
2130                         ret = drop_leaf_ref(trans, root, cur);
2131                         BUG_ON(ret);
2132                         break;
2133                 }
2134                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2135                 blocksize = btrfs_level_size(root, *level - 1);
2136                 ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
2137                 BUG_ON(ret);
2138                 if (refs != 1) {
2139                         parent = path->nodes[*level];
2140                         root_owner = btrfs_header_owner(parent);
2141                         root_gen = btrfs_header_generation(parent);
2142                         path->slots[*level]++;
2143                         ret = btrfs_free_extent(trans, root, bytenr,
2144                                                 blocksize, root_owner,
2145                                                 root_gen, 0, 0, 1);
2146                         BUG_ON(ret);
2147                         continue;
2148                 }
2149                 next = btrfs_find_tree_block(root, bytenr, blocksize);
2150                 if (!next || !btrfs_buffer_uptodate(next)) {
2151                         free_extent_buffer(next);
2152                         reada_walk_down(root, cur, path->slots[*level]);
2153
2154                         mutex_unlock(&root->fs_info->fs_mutex);
2155                         next = read_tree_block(root, bytenr, blocksize);
2156                         mutex_lock(&root->fs_info->fs_mutex);
2157
2158                         /* we've dropped the lock, double check */
2159                         ret = lookup_extent_ref(trans, root, bytenr,
2160                                                 blocksize, &refs);
2161                         BUG_ON(ret);
2162                         if (refs != 1) {
2163                                 parent = path->nodes[*level];
2164                                 root_owner = btrfs_header_owner(parent);
2165                                 root_gen = btrfs_header_generation(parent);
2166
2167                                 path->slots[*level]++;
2168                                 free_extent_buffer(next);
2169                                 ret = btrfs_free_extent(trans, root, bytenr,
2170                                                         blocksize,
2171                                                         root_owner,
2172                                                         root_gen, 0, 0, 1);
2173                                 BUG_ON(ret);
2174                                 continue;
2175                         }
2176                 } else if (next) {
2177                         btrfs_verify_block_csum(root, next);
2178                 }
2179                 WARN_ON(*level <= 0);
2180                 if (path->nodes[*level-1])
2181                         free_extent_buffer(path->nodes[*level-1]);
2182                 path->nodes[*level-1] = next;
2183                 *level = btrfs_header_level(next);
2184                 path->slots[*level] = 0;
2185         }
2186 out:
2187         WARN_ON(*level < 0);
2188         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2189
2190         if (path->nodes[*level] == root->node) {
2191                 root_owner = root->root_key.objectid;
2192                 parent = path->nodes[*level];
2193         } else {
2194                 parent = path->nodes[*level + 1];
2195                 root_owner = btrfs_header_owner(parent);
2196         }
2197
2198         root_gen = btrfs_header_generation(parent);
2199         ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
2200                                 path->nodes[*level]->len,
2201                                 root_owner, root_gen, 0, 0, 1);
2202         free_extent_buffer(path->nodes[*level]);
2203         path->nodes[*level] = NULL;
2204         *level += 1;
2205         BUG_ON(ret);
2206         return 0;
2207 }
2208
2209 /*
2210  * helper for dropping snapshots.  This walks back up the tree in the path
2211  * to find the first node higher up where we haven't yet gone through
2212  * all the slots
2213  */
2214 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
2215                                  struct btrfs_root *root,
2216                                  struct btrfs_path *path, int *level)
2217 {
2218         u64 root_owner;
2219         u64 root_gen;
2220         struct btrfs_root_item *root_item = &root->root_item;
2221         int i;
2222         int slot;
2223         int ret;
2224
2225         for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2226                 slot = path->slots[i];
2227                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
2228                         struct extent_buffer *node;
2229                         struct btrfs_disk_key disk_key;
2230                         node = path->nodes[i];
2231                         path->slots[i]++;
2232                         *level = i;
2233                         WARN_ON(*level == 0);
2234                         btrfs_node_key(node, &disk_key, path->slots[i]);
2235                         memcpy(&root_item->drop_progress,
2236                                &disk_key, sizeof(disk_key));
2237                         root_item->drop_level = i;
2238                         return 0;
2239                 } else {
2240                         if (path->nodes[*level] == root->node) {
2241                                 root_owner = root->root_key.objectid;
2242                                 root_gen =
2243                                    btrfs_header_generation(path->nodes[*level]);
2244                         } else {
2245                                 struct extent_buffer *node;
2246                                 node = path->nodes[*level + 1];
2247                                 root_owner = btrfs_header_owner(node);
2248                                 root_gen = btrfs_header_generation(node);
2249                         }
2250                         ret = btrfs_free_extent(trans, root,
2251                                                 path->nodes[*level]->start,
2252                                                 path->nodes[*level]->len,
2253                                                 root_owner, root_gen, 0, 0, 1);
2254                         BUG_ON(ret);
2255                         free_extent_buffer(path->nodes[*level]);
2256                         path->nodes[*level] = NULL;
2257                         *level = i + 1;
2258                 }
2259         }
2260         return 1;
2261 }
2262
2263 /*
2264  * drop the reference count on the tree rooted at 'snap'.  This traverses
2265  * the tree freeing any blocks that have a ref count of zero after being
2266  * decremented.
2267  */
2268 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
2269                         *root)
2270 {
2271         int ret = 0;
2272         int wret;
2273         int level;
2274         struct btrfs_path *path;
2275         int i;
2276         int orig_level;
2277         struct btrfs_root_item *root_item = &root->root_item;
2278
2279         path = btrfs_alloc_path();
2280         BUG_ON(!path);
2281
2282         level = btrfs_header_level(root->node);
2283         orig_level = level;
2284         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2285                 path->nodes[level] = root->node;
2286                 extent_buffer_get(root->node);
2287                 path->slots[level] = 0;
2288         } else {
2289                 struct btrfs_key key;
2290                 struct btrfs_disk_key found_key;
2291                 struct extent_buffer *node;
2292
2293                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2294                 level = root_item->drop_level;
2295                 path->lowest_level = level;
2296                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2297                 if (wret < 0) {
2298                         ret = wret;
2299                         goto out;
2300                 }
2301                 node = path->nodes[level];
2302                 btrfs_node_key(node, &found_key, path->slots[level]);
2303                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
2304                                sizeof(found_key)));
2305         }
2306         while(1) {
2307                 wret = walk_down_tree(trans, root, path, &level);
2308                 if (wret > 0)
2309                         break;
2310                 if (wret < 0)
2311                         ret = wret;
2312
2313                 wret = walk_up_tree(trans, root, path, &level);
2314                 if (wret > 0)
2315                         break;
2316                 if (wret < 0)
2317                         ret = wret;
2318                 ret = -EAGAIN;
2319                 break;
2320         }
2321         for (i = 0; i <= orig_level; i++) {
2322                 if (path->nodes[i]) {
2323                         free_extent_buffer(path->nodes[i]);
2324                         path->nodes[i] = NULL;
2325                 }
2326         }
2327 out:
2328         btrfs_free_path(path);
2329         return ret;
2330 }
2331
2332 int btrfs_free_block_groups(struct btrfs_fs_info *info)
2333 {
2334         u64 start;
2335         u64 end;
2336         u64 ptr;
2337         int ret;
2338         while(1) {
2339                 ret = find_first_extent_bit(&info->block_group_cache, 0,
2340                                             &start, &end, (unsigned int)-1);
2341                 if (ret)
2342                         break;
2343                 ret = get_state_private(&info->block_group_cache, start, &ptr);
2344                 if (!ret)
2345                         kfree((void *)(unsigned long)ptr);
2346                 clear_extent_bits(&info->block_group_cache, start,
2347                                   end, (unsigned int)-1, GFP_NOFS);
2348         }
2349         while(1) {
2350                 ret = find_first_extent_bit(&info->free_space_cache, 0,
2351                                             &start, &end, EXTENT_DIRTY);
2352                 if (ret)
2353                         break;
2354                 clear_extent_dirty(&info->free_space_cache, start,
2355                                    end, GFP_NOFS);
2356         }
2357         return 0;
2358 }
2359
2360 static unsigned long calc_ra(unsigned long start, unsigned long last,
2361                              unsigned long nr)
2362 {
2363         return min(last, start + nr - 1);
2364 }
2365
2366 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
2367                                          u64 len)
2368 {
2369         u64 page_start;
2370         u64 page_end;
2371         unsigned long last_index;
2372         unsigned long i;
2373         struct page *page;
2374         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2375         struct file_ra_state *ra;
2376         unsigned long total_read = 0;
2377         unsigned long ra_pages;
2378         struct btrfs_trans_handle *trans;
2379
2380         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2381
2382         mutex_lock(&inode->i_mutex);
2383         i = start >> PAGE_CACHE_SHIFT;
2384         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2385
2386         ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
2387
2388         file_ra_state_init(ra, inode->i_mapping);
2389
2390         for (; i <= last_index; i++) {
2391                 if (total_read % ra_pages == 0) {
2392                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2393                                        calc_ra(i, last_index, ra_pages));
2394                 }
2395                 total_read++;
2396                 if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
2397                         goto truncate_racing;
2398
2399                 page = grab_cache_page(inode->i_mapping, i);
2400                 if (!page) {
2401                         goto out_unlock;
2402                 }
2403                 if (!PageUptodate(page)) {
2404                         btrfs_readpage(NULL, page);
2405                         lock_page(page);
2406                         if (!PageUptodate(page)) {
2407                                 unlock_page(page);
2408                                 page_cache_release(page);
2409                                 goto out_unlock;
2410                         }
2411                 }
2412 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
2413                 ClearPageDirty(page);
2414 #else
2415                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2416 #endif
2417                 wait_on_page_writeback(page);
2418                 set_page_extent_mapped(page);
2419                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2420                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2421
2422                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2423
2424                 set_extent_delalloc(io_tree, page_start,
2425                                     page_end, GFP_NOFS);
2426                 set_page_dirty(page);
2427
2428                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2429                 unlock_page(page);
2430                 page_cache_release(page);
2431         }
2432         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2433                                            total_read);
2434
2435 out_unlock:
2436         kfree(ra);
2437         trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
2438         if (trans) {
2439                 btrfs_add_ordered_inode(inode);
2440                 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
2441                 mark_inode_dirty(inode);
2442         }
2443         mutex_unlock(&inode->i_mutex);
2444         return 0;
2445
2446 truncate_racing:
2447         vmtruncate(inode, inode->i_size);
2448         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2449                                            total_read);
2450         goto out_unlock;
2451 }
2452
2453 /*
2454  * The back references tell us which tree holds a ref on a block,
2455  * but it is possible for the tree root field in the reference to
2456  * reflect the original root before a snapshot was made.  In this
2457  * case we should search through all the children of a given root
2458  * to find potential holders of references on a block.
2459  *
2460  * Instead, we do something a little less fancy and just search
2461  * all the roots for a given key/block combination.
2462  */
2463 static int find_root_for_ref(struct btrfs_root *root,
2464                              struct btrfs_path *path,
2465                              struct btrfs_key *key0,
2466                              int level,
2467                              int file_key,
2468                              struct btrfs_root **found_root,
2469                              u64 bytenr)
2470 {
2471         struct btrfs_key root_location;
2472         struct btrfs_root *cur_root = *found_root;
2473         struct btrfs_file_extent_item *file_extent;
2474         u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
2475         u64 found_bytenr;
2476         int ret;
2477         int i;
2478
2479         root_location.offset = (u64)-1;
2480         root_location.type = BTRFS_ROOT_ITEM_KEY;
2481         path->lowest_level = level;
2482         path->reada = 0;
2483         while(1) {
2484                 ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
2485                 found_bytenr = 0;
2486                 if (ret == 0 && file_key) {
2487                         struct extent_buffer *leaf = path->nodes[0];
2488                         file_extent = btrfs_item_ptr(leaf, path->slots[0],
2489                                              struct btrfs_file_extent_item);
2490                         if (btrfs_file_extent_type(leaf, file_extent) ==
2491                             BTRFS_FILE_EXTENT_REG) {
2492                                 found_bytenr =
2493                                         btrfs_file_extent_disk_bytenr(leaf,
2494                                                                file_extent);
2495                        }
2496                 } else if (!file_key) {
2497                         if (path->nodes[level])
2498                                 found_bytenr = path->nodes[level]->start;
2499                 }
2500
2501                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2502                         if (!path->nodes[i])
2503                                 break;
2504                         free_extent_buffer(path->nodes[i]);
2505                         path->nodes[i] = NULL;
2506                 }
2507                 btrfs_release_path(cur_root, path);
2508
2509                 if (found_bytenr == bytenr) {
2510                         *found_root = cur_root;
2511                         ret = 0;
2512                         goto out;
2513                 }
2514                 ret = btrfs_search_root(root->fs_info->tree_root,
2515                                         root_search_start, &root_search_start);
2516                 if (ret)
2517                         break;
2518
2519                 root_location.objectid = root_search_start;
2520                 cur_root = btrfs_read_fs_root_no_name(root->fs_info,
2521                                                       &root_location);
2522                 if (!cur_root) {
2523                         ret = 1;
2524                         break;
2525                 }
2526         }
2527 out:
2528         path->lowest_level = 0;
2529         return ret;
2530 }
2531
2532 /*
2533  * note, this releases the path
2534  */
2535 static int noinline relocate_one_reference(struct btrfs_root *extent_root,
2536                                   struct btrfs_path *path,
2537                                   struct btrfs_key *extent_key)
2538 {
2539         struct inode *inode;
2540         struct btrfs_root *found_root;
2541         struct btrfs_key root_location;
2542         struct btrfs_key found_key;
2543         struct btrfs_extent_ref *ref;
2544         u64 ref_root;
2545         u64 ref_gen;
2546         u64 ref_objectid;
2547         u64 ref_offset;
2548         int ret;
2549         int level;
2550
2551         ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
2552                              struct btrfs_extent_ref);
2553         ref_root = btrfs_ref_root(path->nodes[0], ref);
2554         ref_gen = btrfs_ref_generation(path->nodes[0], ref);
2555         ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
2556         ref_offset = btrfs_ref_offset(path->nodes[0], ref);
2557         btrfs_release_path(extent_root, path);
2558
2559         root_location.objectid = ref_root;
2560         if (ref_gen == 0)
2561                 root_location.offset = 0;
2562         else
2563                 root_location.offset = (u64)-1;
2564         root_location.type = BTRFS_ROOT_ITEM_KEY;
2565
2566         found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
2567                                                 &root_location);
2568         BUG_ON(!found_root);
2569
2570         if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2571                 found_key.objectid = ref_objectid;
2572                 found_key.type = BTRFS_EXTENT_DATA_KEY;
2573                 found_key.offset = ref_offset;
2574                 level = 0;
2575
2576                 ret = find_root_for_ref(extent_root, path, &found_key,
2577                                         level, 1, &found_root,
2578                                         extent_key->objectid);
2579
2580                 if (ret)
2581                         goto out;
2582
2583                 mutex_unlock(&extent_root->fs_info->fs_mutex);
2584                 inode = btrfs_iget_locked(extent_root->fs_info->sb,
2585                                           ref_objectid, found_root);
2586                 if (inode->i_state & I_NEW) {
2587                         /* the inode and parent dir are two different roots */
2588                         BTRFS_I(inode)->root = found_root;
2589                         BTRFS_I(inode)->location.objectid = ref_objectid;
2590                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
2591                         BTRFS_I(inode)->location.offset = 0;
2592                         btrfs_read_locked_inode(inode);
2593                         unlock_new_inode(inode);
2594
2595                 }
2596                 /* this can happen if the reference is not against
2597                  * the latest version of the tree root
2598                  */
2599                 if (is_bad_inode(inode)) {
2600                         mutex_lock(&extent_root->fs_info->fs_mutex);
2601                         goto out;
2602                 }
2603                 relocate_inode_pages(inode, ref_offset, extent_key->offset);
2604                 iput(inode);
2605                 mutex_lock(&extent_root->fs_info->fs_mutex);
2606         } else {
2607                 struct btrfs_trans_handle *trans;
2608                 struct extent_buffer *eb;
2609                 int i;
2610
2611                 eb = read_tree_block(found_root, extent_key->objectid,
2612                                      extent_key->offset);
2613                 level = btrfs_header_level(eb);
2614
2615                 if (level == 0)
2616                         btrfs_item_key_to_cpu(eb, &found_key, 0);
2617                 else
2618                         btrfs_node_key_to_cpu(eb, &found_key, 0);
2619
2620                 free_extent_buffer(eb);
2621
2622                 ret = find_root_for_ref(extent_root, path, &found_key,
2623                                         level, 0, &found_root,
2624                                         extent_key->objectid);
2625
2626                 if (ret)
2627                         goto out;
2628
2629                 trans = btrfs_start_transaction(found_root, 1);
2630
2631                 path->lowest_level = level;
2632                 path->reada = 2;
2633                 ret = btrfs_search_slot(trans, found_root, &found_key, path,
2634                                         0, 1);
2635                 path->lowest_level = 0;
2636                 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2637                         if (!path->nodes[i])
2638                                 break;
2639                         free_extent_buffer(path->nodes[i]);
2640                         path->nodes[i] = NULL;
2641                 }
2642                 btrfs_release_path(found_root, path);
2643                 btrfs_end_transaction(trans, found_root);
2644         }
2645
2646 out:
2647         return 0;
2648 }
2649
2650 static int noinline del_extent_zero(struct btrfs_root *extent_root,
2651                                     struct btrfs_path *path,
2652                                     struct btrfs_key *extent_key)
2653 {
2654         int ret;
2655         struct btrfs_trans_handle *trans;
2656
2657         trans = btrfs_start_transaction(extent_root, 1);
2658         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
2659         if (ret > 0) {
2660                 ret = -EIO;
2661                 goto out;
2662         }
2663         if (ret < 0)
2664                 goto out;
2665         ret = btrfs_del_item(trans, extent_root, path);
2666 out:
2667         btrfs_end_transaction(trans, extent_root);
2668         return ret;
2669 }
2670
2671 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
2672                                         struct btrfs_path *path,
2673                                         struct btrfs_key *extent_key)
2674 {
2675         struct btrfs_key key;
2676         struct btrfs_key found_key;
2677         struct extent_buffer *leaf;
2678         u32 nritems;
2679         u32 item_size;
2680         int ret = 0;
2681
2682         if (extent_key->objectid == 0) {
2683                 ret = del_extent_zero(extent_root, path, extent_key);
2684                 goto out;
2685         }
2686         key.objectid = extent_key->objectid;
2687         key.type = BTRFS_EXTENT_REF_KEY;
2688         key.offset = 0;
2689
2690         while(1) {
2691                 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2692
2693                 if (ret < 0)
2694                         goto out;
2695
2696                 ret = 0;
2697                 leaf = path->nodes[0];
2698                 nritems = btrfs_header_nritems(leaf);
2699                 if (path->slots[0] == nritems) {
2700                         ret = btrfs_next_leaf(extent_root, path);
2701                         if (ret > 0) {
2702                                 ret = 0;
2703                                 goto out;
2704                         }
2705                         if (ret < 0)
2706                                 goto out;
2707                         leaf = path->nodes[0];
2708                 }
2709
2710                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2711                 if (found_key.objectid != extent_key->objectid) {
2712                         break;
2713                 }
2714
2715                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
2716                         break;
2717                 }
2718
2719                 key.offset = found_key.offset + 1;
2720                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2721
2722                 ret = relocate_one_reference(extent_root, path, extent_key);
2723                 if (ret)
2724                         goto out;
2725         }
2726         ret = 0;
2727 out:
2728         btrfs_release_path(extent_root, path);
2729         return ret;
2730 }
2731
2732 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
2733 {
2734         u64 num_devices;
2735         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
2736                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
2737
2738         num_devices = root->fs_info->fs_devices->num_devices;
2739         if (num_devices == 1) {
2740                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2741                 stripped = flags & ~stripped;
2742
2743                 /* turn raid0 into single device chunks */
2744                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
2745                         return stripped;
2746
2747                 /* turn mirroring into duplication */
2748                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2749                              BTRFS_BLOCK_GROUP_RAID10))
2750                         return stripped | BTRFS_BLOCK_GROUP_DUP;
2751                 return flags;
2752         } else {
2753                 /* they already had raid on here, just return */
2754                 if (flags & stripped)
2755                         return flags;
2756
2757                 stripped |= BTRFS_BLOCK_GROUP_DUP;
2758                 stripped = flags & ~stripped;
2759
2760                 /* switch duplicated blocks with raid1 */
2761                 if (flags & BTRFS_BLOCK_GROUP_DUP)
2762                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
2763
2764                 /* turn single device chunks into raid0 */
2765                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
2766         }
2767         return flags;
2768 }
2769
2770 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
2771 {
2772         struct btrfs_trans_handle *trans;
2773         struct btrfs_root *tree_root = root->fs_info->tree_root;
2774         struct btrfs_path *path;
2775         u64 cur_byte;
2776         u64 total_found;
2777         u64 shrink_last_byte;
2778         u64 new_alloc_flags;
2779         struct btrfs_block_group_cache *shrink_block_group;
2780         struct btrfs_fs_info *info = root->fs_info;
2781         struct btrfs_key key;
2782         struct btrfs_key found_key;
2783         struct extent_buffer *leaf;
2784         u32 nritems;
2785         int ret;
2786         int progress;
2787
2788         shrink_block_group = btrfs_lookup_block_group(root->fs_info,
2789                                                       shrink_start);
2790         BUG_ON(!shrink_block_group);
2791
2792         shrink_last_byte = shrink_start + shrink_block_group->key.offset;
2793
2794         shrink_block_group->space_info->total_bytes -=
2795                 shrink_block_group->key.offset;
2796         path = btrfs_alloc_path();
2797         root = root->fs_info->extent_root;
2798         path->reada = 2;
2799
2800         printk("btrfs relocating block group %llu flags %llu\n",
2801                (unsigned long long)shrink_start,
2802                (unsigned long long)shrink_block_group->flags);
2803
2804 again:
2805         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
2806                 u64 calc;
2807
2808                 trans = btrfs_start_transaction(root, 1);
2809                 new_alloc_flags = update_block_group_flags(root,
2810                                                    shrink_block_group->flags);
2811                 if (new_alloc_flags != shrink_block_group->flags) {
2812                         calc =
2813                              btrfs_block_group_used(&shrink_block_group->item);
2814                 } else {
2815                         calc = shrink_block_group->key.offset;
2816                 }
2817                 do_chunk_alloc(trans, root->fs_info->extent_root,
2818                                calc + 2 * 1024 * 1024, new_alloc_flags);
2819                 btrfs_end_transaction(trans, root);
2820         }
2821         shrink_block_group->ro = 1;
2822
2823         total_found = 0;
2824         progress = 0;
2825         key.objectid = shrink_start;
2826         key.offset = 0;
2827         key.type = 0;
2828         cur_byte = key.objectid;
2829
2830         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2831         if (ret < 0)
2832                 goto out;
2833
2834         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
2835         if (ret < 0)
2836                 goto out;
2837
2838         if (ret == 0) {
2839                 leaf = path->nodes[0];
2840                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2841                 if (found_key.objectid + found_key.offset > shrink_start &&
2842                     found_key.objectid < shrink_last_byte) {
2843                         cur_byte = found_key.objectid;
2844                         key.objectid = cur_byte;
2845                 }
2846         }
2847         btrfs_release_path(root, path);
2848
2849         while(1) {
2850                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2851                 if (ret < 0)
2852                         goto out;
2853
2854                 leaf = path->nodes[0];
2855                 nritems = btrfs_header_nritems(leaf);
2856 next:
2857                 if (path->slots[0] >= nritems) {
2858                         ret = btrfs_next_leaf(root, path);
2859                         if (ret < 0)
2860                                 goto out;
2861                         if (ret == 1) {
2862                                 ret = 0;
2863                                 break;
2864                         }
2865                         leaf = path->nodes[0];
2866                         nritems = btrfs_header_nritems(leaf);
2867                 }
2868
2869                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2870
2871                 if (found_key.objectid >= shrink_last_byte)
2872                         break;
2873
2874                 if (progress && need_resched()) {
2875                         memcpy(&key, &found_key, sizeof(key));
2876                         mutex_unlock(&root->fs_info->fs_mutex);
2877                         cond_resched();
2878                         mutex_lock(&root->fs_info->fs_mutex);
2879                         btrfs_release_path(root, path);
2880                         btrfs_search_slot(NULL, root, &key, path, 0, 0);
2881                         progress = 0;
2882                         goto next;
2883                 }
2884                 progress = 1;
2885
2886                 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
2887                     found_key.objectid + found_key.offset <= cur_byte) {
2888                         path->slots[0]++;
2889                         goto next;
2890                 }
2891
2892                 total_found++;
2893                 cur_byte = found_key.objectid + found_key.offset;
2894                 key.objectid = cur_byte;
2895                 btrfs_release_path(root, path);
2896                 ret = relocate_one_extent(root, path, &found_key);
2897         }
2898
2899         btrfs_release_path(root, path);
2900
2901         if (total_found > 0) {
2902                 printk("btrfs relocate found %llu last extent was %llu\n",
2903                        (unsigned long long)total_found,
2904                        (unsigned long long)found_key.objectid);
2905                 trans = btrfs_start_transaction(tree_root, 1);
2906                 btrfs_commit_transaction(trans, tree_root);
2907
2908                 mutex_unlock(&root->fs_info->fs_mutex);
2909                 btrfs_clean_old_snapshots(tree_root);
2910                 mutex_lock(&root->fs_info->fs_mutex);
2911
2912                 trans = btrfs_start_transaction(tree_root, 1);
2913                 btrfs_commit_transaction(trans, tree_root);
2914                 goto again;
2915         }
2916
2917         /*
2918          * we've freed all the extents, now remove the block
2919          * group item from the tree
2920          */
2921         trans = btrfs_start_transaction(root, 1);
2922         memcpy(&key, &shrink_block_group->key, sizeof(key));
2923
2924         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2925         if (ret > 0)
2926                 ret = -EIO;
2927         if (ret < 0)
2928                 goto out;
2929
2930         leaf = path->nodes[0];
2931         nritems = btrfs_header_nritems(leaf);
2932         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2933         kfree(shrink_block_group);
2934
2935         clear_extent_bits(&info->block_group_cache, found_key.objectid,
2936                           found_key.objectid + found_key.offset - 1,
2937                           (unsigned int)-1, GFP_NOFS);
2938
2939         btrfs_del_item(trans, root, path);
2940         clear_extent_dirty(&info->free_space_cache,
2941                            shrink_start, shrink_last_byte - 1,
2942                            GFP_NOFS);
2943         btrfs_commit_transaction(trans, root);
2944 out:
2945         btrfs_free_path(path);
2946         return ret;
2947 }
2948
2949 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
2950                            struct btrfs_key *key)
2951 {
2952         int ret;
2953         struct btrfs_key found_key;
2954         struct extent_buffer *leaf;
2955         int slot;
2956
2957         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2958         if (ret < 0)
2959                 return ret;
2960         while(1) {
2961                 slot = path->slots[0];
2962                 leaf = path->nodes[0];
2963                 if (slot >= btrfs_header_nritems(leaf)) {
2964                         ret = btrfs_next_leaf(root, path);
2965                         if (ret == 0)
2966                                 continue;
2967                         if (ret < 0)
2968                                 goto error;
2969                         break;
2970                 }
2971                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2972
2973                 if (found_key.objectid >= key->objectid &&
2974                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
2975                         return 0;
2976                 path->slots[0]++;
2977         }
2978         ret = -ENOENT;
2979 error:
2980         return ret;
2981 }
2982
2983 int btrfs_read_block_groups(struct btrfs_root *root)
2984 {
2985         struct btrfs_path *path;
2986         int ret;
2987         int bit;
2988         struct btrfs_block_group_cache *cache;
2989         struct btrfs_fs_info *info = root->fs_info;
2990         struct btrfs_space_info *space_info;
2991         struct extent_io_tree *block_group_cache;
2992         struct btrfs_key key;
2993         struct btrfs_key found_key;
2994         struct extent_buffer *leaf;
2995
2996         block_group_cache = &info->block_group_cache;
2997         root = info->extent_root;
2998         key.objectid = 0;
2999         key.offset = 0;
3000         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3001         path = btrfs_alloc_path();
3002         if (!path)
3003                 return -ENOMEM;
3004
3005         while(1) {
3006                 ret = find_first_block_group(root, path, &key);
3007                 if (ret > 0) {
3008                         ret = 0;
3009                         goto error;
3010                 }
3011                 if (ret != 0)
3012                         goto error;
3013
3014                 leaf = path->nodes[0];
3015                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3016                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
3017                 if (!cache) {
3018                         ret = -ENOMEM;
3019                         break;
3020                 }
3021
3022                 read_extent_buffer(leaf, &cache->item,
3023                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
3024                                    sizeof(cache->item));
3025                 memcpy(&cache->key, &found_key, sizeof(found_key));
3026
3027                 key.objectid = found_key.objectid + found_key.offset;
3028                 btrfs_release_path(root, path);
3029                 cache->flags = btrfs_block_group_flags(&cache->item);
3030                 bit = 0;
3031                 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3032                         bit = BLOCK_GROUP_DATA;
3033                 } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3034                         bit = BLOCK_GROUP_SYSTEM;
3035                 } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
3036                         bit = BLOCK_GROUP_METADATA;
3037                 }
3038                 set_avail_alloc_bits(info, cache->flags);
3039
3040                 ret = update_space_info(info, cache->flags, found_key.offset,
3041                                         btrfs_block_group_used(&cache->item),
3042                                         &space_info);
3043                 BUG_ON(ret);
3044                 cache->space_info = space_info;
3045
3046                 /* use EXTENT_LOCKED to prevent merging */
3047                 set_extent_bits(block_group_cache, found_key.objectid,
3048                                 found_key.objectid + found_key.offset - 1,
3049                                 bit | EXTENT_LOCKED, GFP_NOFS);
3050                 set_state_private(block_group_cache, found_key.objectid,
3051                                   (unsigned long)cache);
3052
3053                 if (key.objectid >=
3054                     btrfs_super_total_bytes(&info->super_copy))
3055                         break;
3056         }
3057         ret = 0;
3058 error:
3059         btrfs_free_path(path);
3060         return ret;
3061 }
3062
3063 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3064                            struct btrfs_root *root, u64 bytes_used,
3065                            u64 type, u64 chunk_objectid, u64 chunk_offset,
3066                            u64 size)
3067 {
3068         int ret;
3069         int bit = 0;
3070         struct btrfs_root *extent_root;
3071         struct btrfs_block_group_cache *cache;
3072         struct extent_io_tree *block_group_cache;
3073
3074         extent_root = root->fs_info->extent_root;
3075         block_group_cache = &root->fs_info->block_group_cache;
3076
3077         cache = kzalloc(sizeof(*cache), GFP_NOFS);
3078         BUG_ON(!cache);
3079         cache->key.objectid = chunk_offset;
3080         cache->key.offset = size;
3081
3082         btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3083         memset(&cache->item, 0, sizeof(cache->item));
3084         btrfs_set_block_group_used(&cache->item, bytes_used);
3085         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
3086         cache->flags = type;
3087         btrfs_set_block_group_flags(&cache->item, type);
3088
3089         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
3090                                 &cache->space_info);
3091         BUG_ON(ret);
3092
3093         bit = block_group_state_bits(type);
3094         set_extent_bits(block_group_cache, chunk_offset,
3095                         chunk_offset + size - 1,
3096                         bit | EXTENT_LOCKED, GFP_NOFS);
3097
3098         set_state_private(block_group_cache, chunk_offset,
3099                           (unsigned long)cache);
3100         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
3101                                 sizeof(cache->item));
3102         BUG_ON(ret);
3103
3104         finish_current_insert(trans, extent_root);
3105         ret = del_pending_extents(trans, extent_root);
3106         BUG_ON(ret);
3107         set_avail_alloc_bits(extent_root->fs_info, type);
3108         return 0;
3109 }