Btrfs: alloc_mutex latency reduction
[pandora-kernel.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include "hash.h"
22 #include "crc32c.h"
23 #include "ctree.h"
24 #include "disk-io.h"
25 #include "print-tree.h"
26 #include "transaction.h"
27 #include "volumes.h"
28 #include "locking.h"
29
30 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
31 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
32 #define BLOCK_GROUP_SYSTEM   EXTENT_NEW
33
34 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
35
36 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
37                                  btrfs_root *extent_root);
38 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
39                                btrfs_root *extent_root);
40 static struct btrfs_block_group_cache *
41 __btrfs_find_block_group(struct btrfs_root *root,
42                          struct btrfs_block_group_cache *hint,
43                          u64 search_start, int data, int owner);
44
45 void maybe_lock_mutex(struct btrfs_root *root)
46 {
47         if (root != root->fs_info->extent_root &&
48             root != root->fs_info->chunk_root &&
49             root != root->fs_info->dev_root) {
50                 mutex_lock(&root->fs_info->alloc_mutex);
51         }
52 }
53
54 void maybe_unlock_mutex(struct btrfs_root *root)
55 {
56         if (root != root->fs_info->extent_root &&
57             root != root->fs_info->chunk_root &&
58             root != root->fs_info->dev_root) {
59                 mutex_unlock(&root->fs_info->alloc_mutex);
60         }
61 }
62
63 static int cache_block_group(struct btrfs_root *root,
64                              struct btrfs_block_group_cache *block_group)
65 {
66         struct btrfs_path *path;
67         int ret;
68         struct btrfs_key key;
69         struct extent_buffer *leaf;
70         struct extent_io_tree *free_space_cache;
71         int slot;
72         u64 last = 0;
73         u64 hole_size;
74         u64 first_free;
75         int found = 0;
76
77         if (!block_group)
78                 return 0;
79
80         root = root->fs_info->extent_root;
81         free_space_cache = &root->fs_info->free_space_cache;
82
83         if (block_group->cached)
84                 return 0;
85
86         path = btrfs_alloc_path();
87         if (!path)
88                 return -ENOMEM;
89
90         path->reada = 2;
91         /*
92          * we get into deadlocks with paths held by callers of this function.
93          * since the alloc_mutex is protecting things right now, just
94          * skip the locking here
95          */
96         path->skip_locking = 1;
97         first_free = block_group->key.objectid;
98         key.objectid = block_group->key.objectid;
99         key.offset = 0;
100         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
101         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
102         if (ret < 0)
103                 return ret;
104         ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
105         if (ret < 0)
106                 return ret;
107         if (ret == 0) {
108                 leaf = path->nodes[0];
109                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
110                 if (key.objectid + key.offset > first_free)
111                         first_free = key.objectid + key.offset;
112         }
113         while(1) {
114                 leaf = path->nodes[0];
115                 slot = path->slots[0];
116                 if (slot >= btrfs_header_nritems(leaf)) {
117                         ret = btrfs_next_leaf(root, path);
118                         if (ret < 0)
119                                 goto err;
120                         if (ret == 0) {
121                                 continue;
122                         } else {
123                                 break;
124                         }
125                 }
126                 btrfs_item_key_to_cpu(leaf, &key, slot);
127                 if (key.objectid < block_group->key.objectid) {
128                         goto next;
129                 }
130                 if (key.objectid >= block_group->key.objectid +
131                     block_group->key.offset) {
132                         break;
133                 }
134
135                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
136                         if (!found) {
137                                 last = first_free;
138                                 found = 1;
139                         }
140                         if (key.objectid > last) {
141                                 hole_size = key.objectid - last;
142                                 set_extent_dirty(free_space_cache, last,
143                                                  last + hole_size - 1,
144                                                  GFP_NOFS);
145                         }
146                         last = key.objectid + key.offset;
147                 }
148 next:
149                 path->slots[0]++;
150         }
151
152         if (!found)
153                 last = first_free;
154         if (block_group->key.objectid +
155             block_group->key.offset > last) {
156                 hole_size = block_group->key.objectid +
157                         block_group->key.offset - last;
158                 set_extent_dirty(free_space_cache, last,
159                                  last + hole_size - 1, GFP_NOFS);
160         }
161         block_group->cached = 1;
162 err:
163         btrfs_free_path(path);
164         return 0;
165 }
166
167 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
168                                                        btrfs_fs_info *info,
169                                                          u64 bytenr)
170 {
171         struct extent_io_tree *block_group_cache;
172         struct btrfs_block_group_cache *block_group = NULL;
173         u64 ptr;
174         u64 start;
175         u64 end;
176         int ret;
177
178         bytenr = max_t(u64, bytenr,
179                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
180         block_group_cache = &info->block_group_cache;
181         ret = find_first_extent_bit(block_group_cache,
182                                     bytenr, &start, &end,
183                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
184                                     BLOCK_GROUP_SYSTEM);
185         if (ret) {
186                 return NULL;
187         }
188         ret = get_state_private(block_group_cache, start, &ptr);
189         if (ret)
190                 return NULL;
191
192         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
193         return block_group;
194 }
195
196 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
197                                                          btrfs_fs_info *info,
198                                                          u64 bytenr)
199 {
200         struct extent_io_tree *block_group_cache;
201         struct btrfs_block_group_cache *block_group = NULL;
202         u64 ptr;
203         u64 start;
204         u64 end;
205         int ret;
206
207         bytenr = max_t(u64, bytenr,
208                        BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
209         block_group_cache = &info->block_group_cache;
210         ret = find_first_extent_bit(block_group_cache,
211                                     bytenr, &start, &end,
212                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
213                                     BLOCK_GROUP_SYSTEM);
214         if (ret) {
215                 return NULL;
216         }
217         ret = get_state_private(block_group_cache, start, &ptr);
218         if (ret)
219                 return NULL;
220
221         block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
222         if (block_group->key.objectid <= bytenr && bytenr <
223             block_group->key.objectid + block_group->key.offset)
224                 return block_group;
225         return NULL;
226 }
227
228 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
229 {
230         return (cache->flags & bits) == bits;
231 }
232
/*
 * Find a free span of at least @num bytes for an allocation of type
 * @data (BLOCK_GROUP_* bits), starting from *@start_ret within
 * *@cache_ret.  On success returns 0 with *start_ret set to the start
 * of the span and *cache_ret pointing at the group that holds it;
 * returns -ENOSPC when no group can satisfy the request.  Caller must
 * hold fs_info->alloc_mutex.
 */
static int noinline find_search_start(struct btrfs_root *root,
                              struct btrfs_block_group_cache **cache_ret,
                              u64 *start_ret, u64 num, int data)
{
        int ret;
        struct btrfs_block_group_cache *cache = *cache_ret;
        struct extent_io_tree *free_space_cache;
        struct extent_state *state;
        u64 last;
        u64 start = 0;
        u64 cache_miss = 0;
        u64 total_fs_bytes;
        u64 search_start = *start_ret;
        int wrapped = 0;

        WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
        free_space_cache = &root->fs_info->free_space_cache;

        if (!cache)
                goto out;

again:
        /* make sure this group's free space extents are loaded */
        ret = cache_block_group(root, cache);
        if (ret) {
                goto out;
        }

        last = max(search_start, cache->key.objectid);
        /* wrong allocation type or read-only group: try the next one */
        if (!block_group_bits(cache, data) || cache->ro)
                goto new_group;

        spin_lock_irq(&free_space_cache->lock);
        state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
        while(1) {
                if (!state) {
                        /* remember where free space ran out so the next
                         * group's cache fill can resume from there */
                        if (!cache_miss)
                                cache_miss = last;
                        spin_unlock_irq(&free_space_cache->lock);
                        goto new_group;
                }

                start = max(last, state->start);
                last = state->end + 1;
                if (last - start < num) {
                        /* span too small; advance to the next dirty
                         * (free) extent state */
                        do {
                                state = extent_state_next(state);
                        } while(state && !(state->state & EXTENT_DIRTY));
                        continue;
                }
                spin_unlock_irq(&free_space_cache->lock);
                /* re-check ro: it may have flipped while we held the
                 * free space lock */
                if (cache->ro) {
                        goto new_group;
                }
                /* the span must fit entirely inside the current group */
                if (start + num > cache->key.objectid + cache->key.offset)
                        goto new_group;
                if (!block_group_bits(cache, data)) {
                        printk("block group bits don't match %Lu %d\n", cache->flags, data);
                }
                *start_ret = start;
                return 0;
        }
out:
        cache = btrfs_lookup_block_group(root->fs_info, search_start);
        if (!cache) {
                printk("Unable to find block group for %Lu\n", search_start);
                WARN_ON(1);
        }
        return -ENOSPC;

new_group:
        last = cache->key.objectid + cache->key.offset;
wrapped:
        cache = btrfs_lookup_first_block_group(root->fs_info, last);
        if (!cache || cache->key.objectid >= total_fs_bytes) {
no_cache:
                /* ran off the end of the fs; wrap to search_start once */
                if (!wrapped) {
                        wrapped = 1;
                        last = search_start;
                        goto wrapped;
                }
                goto out;
        }
        /* an uncached group may have hidden the space we skipped past;
         * fill its cache and retry from the recorded miss point */
        if (cache_miss && !cache->cached) {
                cache_block_group(root, cache);
                last = cache_miss;
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
        }
        cache_miss = 0;
        cache = btrfs_find_block_group(root, cache, last, data, 0);
        if (!cache)
                goto no_cache;
        *cache_ret = cache;
        goto again;
}
328
329 static u64 div_factor(u64 num, int factor)
330 {
331         if (factor == 10)
332                 return num;
333         num *= factor;
334         do_div(num, 10);
335         return num;
336 }
337
338 static int block_group_state_bits(u64 flags)
339 {
340         int bits = 0;
341         if (flags & BTRFS_BLOCK_GROUP_DATA)
342                 bits |= BLOCK_GROUP_DATA;
343         if (flags & BTRFS_BLOCK_GROUP_METADATA)
344                 bits |= BLOCK_GROUP_METADATA;
345         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
346                 bits |= BLOCK_GROUP_SYSTEM;
347         return bits;
348 }
349
/*
 * Pick a block group to allocate from.  @data carries the
 * BTRFS_BLOCK_GROUP_* type flags; a group qualifies when it matches the
 * type, is not read-only, and its used + pinned bytes are below a
 * fullness threshold (key.offset * factor / 10).  @search_start and
 * @hint bias the search; if nothing qualifies, the scan wraps once and
 * then retries with the threshold relaxed to completely full
 * (factor 10).  Returns NULL when every candidate is rejected.
 */
static struct btrfs_block_group_cache *
__btrfs_find_block_group(struct btrfs_root *root,
                         struct btrfs_block_group_cache *hint,
                         u64 search_start, int data, int owner)
{
        struct btrfs_block_group_cache *cache;
        struct extent_io_tree *block_group_cache;
        struct btrfs_block_group_cache *found_group = NULL;
        struct btrfs_fs_info *info = root->fs_info;
        u64 used;
        u64 last = 0;
        u64 start;
        u64 end;
        u64 free_check;
        u64 ptr;
        int bit;
        int ret;
        int full_search = 0;
        int factor = 10;
        int wrapped = 0;

        block_group_cache = &info->block_group_cache;

        /* leave metadata groups 10% slack on the first pass */
        if (data & BTRFS_BLOCK_GROUP_METADATA)
                factor = 9;

        bit = block_group_state_bits(data);

        /* fast path: try the group containing search_start first */
        if (search_start) {
                struct btrfs_block_group_cache *shint;
                shint = btrfs_lookup_first_block_group(info, search_start);
                if (shint && block_group_bits(shint, data) && !shint->ro) {
                        spin_lock(&shint->lock);
                        used = btrfs_block_group_used(&shint->item);
                        if (used + shint->pinned <
                            div_factor(shint->key.offset, factor)) {
                                spin_unlock(&shint->lock);
                                return shint;
                        }
                        spin_unlock(&shint->lock);
                }
        }
        /* second fast path: the caller-provided hint group */
        if (hint && !hint->ro && block_group_bits(hint, data)) {
                spin_lock(&hint->lock);
                used = btrfs_block_group_used(&hint->item);
                if (used + hint->pinned <
                    div_factor(hint->key.offset, factor)) {
                        spin_unlock(&hint->lock);
                        return hint;
                }
                spin_unlock(&hint->lock);
                last = hint->key.objectid + hint->key.offset;
        } else {
                if (hint)
                        last = max(hint->key.objectid, search_start);
                else
                        last = search_start;
        }
again:
        /* linear scan of the cached groups from 'last' onward */
        while(1) {
                ret = find_first_extent_bit(block_group_cache, last,
                                            &start, &end, bit);
                if (ret)
                        break;

                ret = get_state_private(block_group_cache, start, &ptr);
                if (ret) {
                        /* no group pointer recorded here; skip the range */
                        last = end + 1;
                        continue;
                }

                cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if (!cache->ro && block_group_bits(cache, data)) {
                        free_check = div_factor(cache->key.offset, factor);
                        if (used + cache->pinned < free_check) {
                                found_group = cache;
                                spin_unlock(&cache->lock);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                cond_resched();
        }
        /* first retry: wrap back to search_start, same threshold */
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        /* final retry: accept groups right up to completely full */
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return found_group;
}
451
452 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
453                                                  struct btrfs_block_group_cache
454                                                  *hint, u64 search_start,
455                                                  int data, int owner)
456 {
457
458         struct btrfs_block_group_cache *ret;
459         ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
460         return ret;
461 }
462 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
463                            u64 owner, u64 owner_offset)
464 {
465         u32 high_crc = ~(u32)0;
466         u32 low_crc = ~(u32)0;
467         __le64 lenum;
468         lenum = cpu_to_le64(root_objectid);
469         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
470         lenum = cpu_to_le64(ref_generation);
471         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
472         if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
473                 lenum = cpu_to_le64(owner);
474                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
475                 lenum = cpu_to_le64(owner_offset);
476                 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
477         }
478         return ((u64)high_crc << 32) | (u64)low_crc;
479 }
480
481 static int match_extent_ref(struct extent_buffer *leaf,
482                             struct btrfs_extent_ref *disk_ref,
483                             struct btrfs_extent_ref *cpu_ref)
484 {
485         int ret;
486         int len;
487
488         if (cpu_ref->objectid)
489                 len = sizeof(*cpu_ref);
490         else
491                 len = 2 * sizeof(u64);
492         ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
493                                    len);
494         return ret == 0;
495 }
496
/*
 * Find the back reference item for @bytenr matching the
 * (root_objectid, ref_generation, owner, owner_offset) tuple.  The key
 * offset is a hash of the tuple, so collisions are resolved by walking
 * forward through same-hash items and comparing bodies.  @del selects a
 * cow'ing search (ins_len -1) so the caller can delete the item.
 *
 * Returns 0 with @path positioned on the match, a positive value when
 * no match exists, or a negative errno on search failure.
 */
static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, u64 bytenr,
                                          u64 root_objectid,
                                          u64 ref_generation, u64 owner,
                                          u64 owner_offset, int del)
{
        u64 hash;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_extent_ref ref;
        struct extent_buffer *leaf;
        struct btrfs_extent_ref *disk_ref;
        int ret;
        int ret2;

        /* stack copy used for byte-wise comparison against disk items */
        btrfs_set_stack_ref_root(&ref, root_objectid);
        btrfs_set_stack_ref_generation(&ref, ref_generation);
        btrfs_set_stack_ref_objectid(&ref, owner);
        btrfs_set_stack_ref_offset(&ref, owner_offset);

        hash = hash_extent_ref(root_objectid, ref_generation, owner,
                               owner_offset);
        key.offset = hash;
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_REF_KEY;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path,
                                        del ? -1 : 0, del);
                if (ret < 0)
                        goto out;
                leaf = path->nodes[0];
                if (ret != 0) {
                        /* no exact key hit; look at the next item, moving
                         * to the next leaf if the slot ran off the end */
                        u32 nritems = btrfs_header_nritems(leaf);
                        if (path->slots[0] >= nritems) {
                                ret2 = btrfs_next_leaf(root, path);
                                if (ret2)
                                        goto out;
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                        if (found_key.objectid != bytenr ||
                            found_key.type != BTRFS_EXTENT_REF_KEY)
                                goto out;
                        key.offset = found_key.offset;
                        if (del) {
                                /* re-search with the exact key so the leaf
                                 * is cow'd with room for the delete */
                                btrfs_release_path(root, path);
                                continue;
                        }
                }
                disk_ref = btrfs_item_ptr(path->nodes[0],
                                          path->slots[0],
                                          struct btrfs_extent_ref);
                if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
                        ret = 0;
                        goto out;
                }
                /* hash collision: advance past this item and keep looking */
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                key.offset = found_key.offset + 1;
                btrfs_release_path(root, path);
        }
out:
        return ret;
}
562
563 /*
564  * Back reference rules.  Back refs have three main goals:
565  *
566  * 1) differentiate between all holders of references to an extent so that
567  *    when a reference is dropped we can make sure it was a valid reference
568  *    before freeing the extent.
569  *
570  * 2) Provide enough information to quickly find the holders of an extent
571  *    if we notice a given block is corrupted or bad.
572  *
573  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
574  *    maintenance.  This is actually the same as #2, but with a slightly
575  *    different use case.
576  *
577  * File extents can be referenced by:
578  *
579  * - multiple snapshots, subvolumes, or different generations in one subvol
580  * - different files inside a single subvolume (in theory, not implemented yet)
581  * - different offsets inside a file (bookend extents in file.c)
582  *
583  * The extent ref structure has fields for:
584  *
585  * - Objectid of the subvolume root
586  * - Generation number of the tree holding the reference
587  * - objectid of the file holding the reference
588  * - offset in the file corresponding to the key holding the reference
589  *
590  * When a file extent is allocated the fields are filled in:
591  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
592  *
593  * When a leaf is cow'd new references are added for every file extent found
594  * in the leaf.  It looks the same as the create case, but trans->transid
595  * will be different when the block is cow'd.
596  *
597  *     (root_key.objectid, trans->transid, inode objectid, offset in file)
598  *
599  * When a file extent is removed either during snapshot deletion or file
600  * truncation, the corresponding back reference is found
601  * by searching for:
602  *
603  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
604  *      inode objectid, offset in file)
605  *
606  * Btree extents can be referenced by:
607  *
608  * - Different subvolumes
609  * - Different generations of the same subvolume
610  *
611  * Storing sufficient information for a full reverse mapping of a btree
612  * block would require storing the lowest key of the block in the backref,
613  * and it would require updating that lowest key either before write out or
614  * every time it changed.  Instead, the objectid of the lowest key is stored
615  * along with the level of the tree block.  This provides a hint
616  * about where in the btree the block can be found.  Searches through the
617  * btree only need to look for a pointer to that block, so they stop one
618  * level higher than the level recorded in the backref.
619  *
620  * Some btrees do not do reference counting on their extents.  These
621  * include the extent tree and the tree of tree roots.  Backrefs for these
622  * trees always have a generation of zero.
623  *
624  * When a tree block is created, back references are inserted:
625  *
626  * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
627  *
628  * When a tree block is cow'd in a reference counted root,
629  * new back references are added for all the blocks it points to.
630  * These are of the form (trans->transid will have increased since creation):
631  *
632  * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
633  *
634  * Because the lowest_key_objectid and the level are just hints
635  * they are not used when backrefs are deleted.  When a backref is deleted:
636  *
637  * if backref was for a tree root:
638  *     root_objectid = root->root_key.objectid
639  * else
640  *     root_objectid = btrfs_header_owner(parent)
641  *
642  * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
643  *
644  * Back Reference Key hashing:
645  *
 * Back references have four fields, each 64 bits long.  Unfortunately,
 * this is hashed into a single 64 bit number and placed into the key offset.
648  * The key objectid corresponds to the first byte in the extent, and the
649  * key type is set to BTRFS_EXTENT_REF_KEY
650  */
/*
 * Insert a back reference item for @bytenr.  The key offset is the hash
 * of the reference tuple; on -EEXIST the offset is bumped until either a
 * free slot is found or an identical existing ref is discovered (which
 * counts as success).  Returns 0 on success or a negative errno.
 */
int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path, u64 bytenr,
                                 u64 root_objectid, u64 ref_generation,
                                 u64 owner, u64 owner_offset)
{
        u64 hash;
        struct btrfs_key key;
        struct btrfs_extent_ref ref;
        struct btrfs_extent_ref *disk_ref;
        int ret;

        btrfs_set_stack_ref_root(&ref, root_objectid);
        btrfs_set_stack_ref_generation(&ref, ref_generation);
        btrfs_set_stack_ref_objectid(&ref, owner);
        btrfs_set_stack_ref_offset(&ref, owner_offset);

        hash = hash_extent_ref(root_objectid, ref_generation, owner,
                               owner_offset);
        key.offset = hash;
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_REF_KEY;

        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
        while (ret == -EEXIST) {
                /* hash collision: an item already sits at this offset */
                disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                          struct btrfs_extent_ref);
                if (match_extent_ref(path->nodes[0], disk_ref, &ref))
                        goto out;
                key.offset++;
                btrfs_release_path(root, path);
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              sizeof(ref));
        }
        if (ret)
                goto out;
        /* fill in the freshly inserted item */
        disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                  struct btrfs_extent_ref);
        write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
                            sizeof(ref));
        btrfs_mark_buffer_dirty(path->nodes[0]);
out:
        btrfs_release_path(root, path);
        return ret;
}
696
697 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
698                                 struct btrfs_root *root,
699                                 u64 bytenr, u64 num_bytes,
700                                 u64 root_objectid, u64 ref_generation,
701                                 u64 owner, u64 owner_offset)
702 {
703         struct btrfs_path *path;
704         int ret;
705         struct btrfs_key key;
706         struct extent_buffer *l;
707         struct btrfs_extent_item *item;
708         u32 refs;
709
710         WARN_ON(num_bytes < root->sectorsize);
711         path = btrfs_alloc_path();
712         if (!path)
713                 return -ENOMEM;
714
715         path->reada = 1;
716         key.objectid = bytenr;
717         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
718         key.offset = num_bytes;
719         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
720                                 0, 1);
721         if (ret < 0)
722                 return ret;
723         if (ret != 0) {
724                 BUG();
725         }
726         BUG_ON(ret != 0);
727         l = path->nodes[0];
728         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
729         refs = btrfs_extent_refs(l, item);
730         btrfs_set_extent_refs(l, item, refs + 1);
731         btrfs_mark_buffer_dirty(path->nodes[0]);
732
733         btrfs_release_path(root->fs_info->extent_root, path);
734
735         path->reada = 1;
736         ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
737                                           path, bytenr, root_objectid,
738                                           ref_generation, owner, owner_offset);
739         BUG_ON(ret);
740         finish_current_insert(trans, root->fs_info->extent_root);
741         del_pending_extents(trans, root->fs_info->extent_root);
742
743         btrfs_free_path(path);
744         return 0;
745 }
746
747 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
748                                 struct btrfs_root *root,
749                                 u64 bytenr, u64 num_bytes,
750                                 u64 root_objectid, u64 ref_generation,
751                                 u64 owner, u64 owner_offset)
752 {
753         int ret;
754
755         mutex_lock(&root->fs_info->alloc_mutex);
756         ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
757                                      root_objectid, ref_generation,
758                                      owner, owner_offset);
759         mutex_unlock(&root->fs_info->alloc_mutex);
760         return ret;
761 }
762
763 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
764                          struct btrfs_root *root)
765 {
766         finish_current_insert(trans, root->fs_info->extent_root);
767         del_pending_extents(trans, root->fs_info->extent_root);
768         return 0;
769 }
770
771 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
772                              struct btrfs_root *root, u64 bytenr,
773                              u64 num_bytes, u32 *refs)
774 {
775         struct btrfs_path *path;
776         int ret;
777         struct btrfs_key key;
778         struct extent_buffer *l;
779         struct btrfs_extent_item *item;
780
781         WARN_ON(num_bytes < root->sectorsize);
782         path = btrfs_alloc_path();
783         path->reada = 1;
784         key.objectid = bytenr;
785         key.offset = num_bytes;
786         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
787         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
788                                 0, 0);
789         if (ret < 0)
790                 goto out;
791         if (ret != 0) {
792                 btrfs_print_leaf(root, path->nodes[0]);
793                 printk("failed to find block number %Lu\n", bytenr);
794                 BUG();
795         }
796         l = path->nodes[0];
797         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
798         *refs = btrfs_extent_refs(l, item);
799 out:
800         btrfs_free_path(path);
801         return 0;
802 }
803
/*
 * Walk the chain of extents starting at first_extent and then up through
 * the tree blocks recorded in count_path, counting how many distinct
 * roots reference them.
 *
 * Return values:
 *   0 - no references were found for some extent in the chain
 *   1 - every extent examined is referenced only by this root
 *   2 - some extent is (or must be assumed to be) shared with another
 *       root/snapshot; callers treat this as "must COW"
 *
 * NOTE(review): this function is currently disabled -- the unconditional
 * BUG() below fires before any work is done, because the walk is not yet
 * safe under the new locking scheme (see the FIXME).  Everything after
 * the BUG() is effectively dead until that is resolved.
 */
u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
                                  struct btrfs_path *count_path,
                                  u64 expected_owner,
                                  u64 first_extent)
{
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        struct btrfs_path *path;
        u64 bytenr;
        u64 found_objectid;
        u64 found_owner;
        u64 root_objectid = root->root_key.objectid;
        u32 total_count = 0;
        u32 extent_refs;
        u32 cur_count;
        u32 nritems;
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *l;
        struct btrfs_extent_item *item;
        struct btrfs_extent_ref *ref_item;
        int level = -1;

        /* FIXME, needs locking */
        BUG();

        mutex_lock(&root->fs_info->alloc_mutex);
        path = btrfs_alloc_path();
again:
        /* level == -1 means we are still on the data extent itself;
         * levels >= 0 walk up the tree blocks in count_path */
        if (level == -1)
                bytenr = first_extent;
        else
                bytenr = count_path->nodes[level]->start;

        cur_count = 0;
        key.objectid = bytenr;
        key.offset = 0;

        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        /* offset 0 never matches a real extent item, so an exact hit
         * would mean a corrupt tree */
        BUG_ON(ret == 0);

        l = path->nodes[0];
        btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);

        if (found_key.objectid != bytenr ||
            found_key.type != BTRFS_EXTENT_ITEM_KEY) {
                goto out;
        }

        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
        extent_refs = btrfs_extent_refs(l, item);
        /* scan the backref items that follow the extent item */
        while (1) {
                l = path->nodes[0];
                nritems = btrfs_header_nritems(l);
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret == 0)
                                continue;
                        break;
                }
                btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
                if (found_key.objectid != bytenr)
                        break;

                if (found_key.type != BTRFS_EXTENT_REF_KEY) {
                        path->slots[0]++;
                        continue;
                }

                cur_count++;
                ref_item = btrfs_item_ptr(l, path->slots[0],
                                          struct btrfs_extent_ref);
                found_objectid = btrfs_ref_root(l, ref_item);

                /* a ref from any other root means the extent is shared */
                if (found_objectid != root_objectid) {
                        total_count = 2;
                        goto out;
                }
                if (level == -1) {
                        found_owner = btrfs_ref_objectid(l, ref_item);
                        if (found_owner != expected_owner) {
                                total_count = 2;
                                goto out;
                        }
                        /*
                         * nasty.  we don't count a reference held by
                         * the running transaction.  This allows nodatacow
                         * to avoid cow most of the time
                         */
                        if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
                            btrfs_ref_generation(l, ref_item) ==
                            root->fs_info->generation) {
                                extent_refs--;
                        }
                }
                total_count = 1;
                path->slots[0]++;
        }
        /*
         * if there is more than one reference against a data extent,
         * we have to assume the other ref is another snapshot
         */
        if (level == -1 && extent_refs > 1) {
                total_count = 2;
                goto out;
        }
        if (cur_count == 0) {
                total_count = 0;
                goto out;
        }
        /* stop once we've checked the root block of this path */
        if (level >= 0 && root->node == count_path->nodes[level])
                goto out;
        level++;
        btrfs_release_path(root, path);
        goto again;

out:
        btrfs_free_path(path);
        mutex_unlock(&root->fs_info->alloc_mutex);
        return total_count;
}
928
/*
 * Add one reference to everything 'buf' points at: on a leaf, each real
 * (non-inline, non-hole) file extent; on a node, each lower tree block.
 * Called when a block is COWed in a reference-counted root so the old
 * children stay pinned by the new copy.
 *
 * The alloc_mutex is taken and dropped around each individual
 * __btrfs_inc_extent_ref() call rather than across the whole loop,
 * keeping lock hold times short on large blocks.
 *
 * Returns 0 on success or the first failing ref-increment's error.
 * Note: on failure no rollback is performed -- the undo loop below is
 * compiled out with #if 0, so earlier increments in this buf are left
 * in place (we just WARN and return the error).
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf)
{
        u64 bytenr;
        u32 nritems;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        int i;
        int level;
        int ret;
        int faili;

        /* roots that don't keep reference counts need no extra refs */
        if (!root->ref_cows)
                return 0;

        level = btrfs_header_level(buf);
        nritems = btrfs_header_nritems(buf);
        for (i = 0; i < nritems; i++) {
                cond_resched();
                if (level == 0) {
                        u64 disk_bytenr;
                        btrfs_item_key_to_cpu(buf, &key, i);
                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                                continue;
                        fi = btrfs_item_ptr(buf, i,
                                            struct btrfs_file_extent_item);
                        /* inline extents live in the leaf; nothing to ref */
                        if (btrfs_file_extent_type(buf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE)
                                continue;
                        disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
                        /* bytenr 0 marks a hole; no backing extent */
                        if (disk_bytenr == 0)
                                continue;

                        mutex_lock(&root->fs_info->alloc_mutex);
                        ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
                                    btrfs_file_extent_disk_num_bytes(buf, fi),
                                    root->root_key.objectid, trans->transid,
                                    key.objectid, key.offset);
                        mutex_unlock(&root->fs_info->alloc_mutex);
                        if (ret) {
                                faili = i;
                                WARN_ON(1);
                                goto fail;
                        }
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        btrfs_node_key_to_cpu(buf, &key, i);

                        mutex_lock(&root->fs_info->alloc_mutex);
                        ret = __btrfs_inc_extent_ref(trans, root, bytenr,
                                           btrfs_level_size(root, level - 1),
                                           root->root_key.objectid,
                                           trans->transid,
                                           level - 1, key.objectid);
                        mutex_unlock(&root->fs_info->alloc_mutex);
                        if (ret) {
                                faili = i;
                                WARN_ON(1);
                                goto fail;
                        }
                }
        }
        return 0;
fail:
        WARN_ON(1);
#if 0
        /* disabled rollback of the refs taken before the failure;
         * note this references 'err' which is not declared here */
        for (i =0; i < faili; i++) {
                if (level == 0) {
                        u64 disk_bytenr;
                        btrfs_item_key_to_cpu(buf, &key, i);
                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                                continue;
                        fi = btrfs_item_ptr(buf, i,
                                            struct btrfs_file_extent_item);
                        if (btrfs_file_extent_type(buf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE)
                                continue;
                        disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
                        if (disk_bytenr == 0)
                                continue;
                        err = btrfs_free_extent(trans, root, disk_bytenr,
                                    btrfs_file_extent_disk_num_bytes(buf,
                                                                      fi), 0);
                        BUG_ON(err);
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        err = btrfs_free_extent(trans, root, bytenr,
                                        btrfs_level_size(root, level - 1), 0);
                        BUG_ON(err);
                }
        }
#endif
        return ret;
}
1023
/*
 * Write one block group item back into the extent tree: look up its key,
 * copy cache->item over the on-disk item and dirty the leaf.
 *
 * On search failure we still fall through to the fail: label so that the
 * deferred insert/delete queues are flushed either way.  Returns 0 on
 * success, otherwise the search error or the del_pending_extents error.
 */
static int write_one_cache_group(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_block_group_cache *cache)
{
        int ret;
        int pending_ret;
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        unsigned long bi;
        struct extent_buffer *leaf;

        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
        if (ret < 0)
                goto fail;
        /* the block group item must already exist; ret > 0 means it
         * vanished, which would be a corrupt extent tree */
        BUG_ON(ret);

        leaf = path->nodes[0];
        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(extent_root, path);
fail:
        /* flush deferred extent-tree work even on the error path */
        finish_current_insert(trans, extent_root);
        pending_ret = del_pending_extents(trans, extent_root);
        if (ret)
                return ret;
        if (pending_ret)
                return pending_ret;
        return 0;

}
1055
/*
 * Walk the block_group_cache extent tree for ranges tagged
 * BLOCK_GROUP_DIRTY and write each corresponding block group item back
 * to disk via write_one_cache_group().
 *
 * A group that fails to write keeps its dirty bit so a later pass can
 * retry it; the first such error is remembered and returned (werr).
 * Returns 0 when every dirty group was written, -ENOMEM if no path
 * could be allocated.  Runs under the fs-wide alloc_mutex.
 */
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        struct extent_io_tree *block_group_cache;
        struct btrfs_block_group_cache *cache;
        int ret;
        int err = 0;
        int werr = 0;
        struct btrfs_path *path;
        u64 last = 0;
        u64 start;
        u64 end;
        u64 ptr;

        block_group_cache = &root->fs_info->block_group_cache;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        mutex_lock(&root->fs_info->alloc_mutex);
        while(1) {
                ret = find_first_extent_bit(block_group_cache, last,
                                            &start, &end, BLOCK_GROUP_DIRTY);
                if (ret)
                        break;

                last = end + 1;
                /* the cache struct pointer is stashed as the range's
                 * private data */
                ret = get_state_private(block_group_cache, start, &ptr);
                if (ret)
                        break;
                cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
                err = write_one_cache_group(trans, root,
                                            path, cache);
                /*
                 * if we fail to write the cache group, we want
                 * to keep it marked dirty in hopes that a later
                 * write will work
                 */
                if (err) {
                        werr = err;
                        continue;
                }
                clear_extent_bits(block_group_cache, start, end,
                                  BLOCK_GROUP_DIRTY, GFP_NOFS);
        }
        btrfs_free_path(path);
        mutex_unlock(&root->fs_info->alloc_mutex);
        return werr;
}
1105
1106 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
1107                                                   u64 flags)
1108 {
1109         struct list_head *head = &info->space_info;
1110         struct list_head *cur;
1111         struct btrfs_space_info *found;
1112         list_for_each(cur, head) {
1113                 found = list_entry(cur, struct btrfs_space_info, list);
1114                 if (found->flags == flags)
1115                         return found;
1116         }
1117         return NULL;
1118
1119 }
1120
1121 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1122                              u64 total_bytes, u64 bytes_used,
1123                              struct btrfs_space_info **space_info)
1124 {
1125         struct btrfs_space_info *found;
1126
1127         found = __find_space_info(info, flags);
1128         if (found) {
1129                 found->total_bytes += total_bytes;
1130                 found->bytes_used += bytes_used;
1131                 found->full = 0;
1132                 WARN_ON(found->total_bytes < found->bytes_used);
1133                 *space_info = found;
1134                 return 0;
1135         }
1136         found = kmalloc(sizeof(*found), GFP_NOFS);
1137         if (!found)
1138                 return -ENOMEM;
1139
1140         list_add(&found->list, &info->space_info);
1141         found->flags = flags;
1142         found->total_bytes = total_bytes;
1143         found->bytes_used = bytes_used;
1144         found->bytes_pinned = 0;
1145         found->full = 0;
1146         found->force_alloc = 0;
1147         *space_info = found;
1148         return 0;
1149 }
1150
1151 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1152 {
1153         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1154                                    BTRFS_BLOCK_GROUP_RAID1 |
1155                                    BTRFS_BLOCK_GROUP_RAID10 |
1156                                    BTRFS_BLOCK_GROUP_DUP);
1157         if (extra_flags) {
1158                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1159                         fs_info->avail_data_alloc_bits |= extra_flags;
1160                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1161                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1162                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1163                         fs_info->avail_system_alloc_bits |= extra_flags;
1164         }
1165 }
1166
1167 static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1168 {
1169         u64 num_devices = root->fs_info->fs_devices->num_devices;
1170
1171         if (num_devices == 1)
1172                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1173         if (num_devices < 4)
1174                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1175
1176         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1177             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1178                       BTRFS_BLOCK_GROUP_RAID10))) {
1179                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1180         }
1181
1182         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1183             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1184                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1185         }
1186
1187         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1188             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1189              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1190              (flags & BTRFS_BLOCK_GROUP_DUP)))
1191                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1192         return flags;
1193 }
1194
/*
 * Allocate a new chunk (and its block group) for the profile in 'flags'
 * if the matching space_info is getting full or 'force' is set.
 *
 * The decision threshold is div_factor(total_bytes, 6) -- presumably
 * ~60% of the space used/pinned before we grow; confirm against
 * div_factor's definition.  ENOSPC from btrfs_alloc_chunk() marks the
 * space_info full so we stop retrying.  Always returns 0 (failures
 * other than ENOSPC trip BUG_ON).  Takes chunk_mutex around the actual
 * chunk allocation; callers are expected to hold alloc_mutex.
 */
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force)
{
        struct btrfs_space_info *space_info;
        u64 thresh;
        u64 start;
        u64 num_bytes;
        int ret;

        flags = reduce_alloc_profile(extent_root, flags);

        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
                ret = update_space_info(extent_root->fs_info, flags,
                                        0, 0, &space_info);
                BUG_ON(ret);
        }
        BUG_ON(!space_info);

        /* a one-shot request to allocate regardless of the threshold */
        if (space_info->force_alloc) {
                force = 1;
                space_info->force_alloc = 0;
        }
        if (space_info->full)
                goto out;

        thresh = div_factor(space_info->total_bytes, 6);
        if (!force &&
           (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
            thresh)
                goto out;

        mutex_lock(&extent_root->fs_info->chunk_mutex);
        ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
        if (ret == -ENOSPC) {
printk("space info full %Lu\n", flags);
                space_info->full = 1;
                goto out_unlock;
        }
        BUG_ON(ret);

        ret = btrfs_make_block_group(trans, extent_root, 0, flags,
                     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
        BUG_ON(ret);
out_unlock:
        mutex_unlock(&extent_root->fs_info->chunk_mutex);
out:
        return 0;
}
1245
/*
 * Adjust the used-bytes accounting for every block group overlapped by
 * [bytenr, bytenr + num_bytes).  alloc != 0 adds to the usage, alloc == 0
 * subtracts; when freeing with mark_free set, the range is also returned
 * to the free_space_cache.  Each touched group is tagged dirty so
 * btrfs_write_dirty_block_groups() will write it back.
 *
 * Must be called with alloc_mutex held (asserted below).  Returns 0, or
 * -1 if some byte in the range belongs to no known block group.
 */
static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free)
{
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *info = root->fs_info;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
        u64 start;
        u64 end;

        WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        while(total) {
                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache) {
                        return -1;
                }
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
                start = cache->key.objectid;
                end = start + cache->key.offset - 1;
                /* mark the whole group dirty so it gets written back */
                set_extent_bits(&info->block_group_cache, start, end,
                                BLOCK_GROUP_DIRTY, GFP_NOFS);

                spin_lock(&cache->lock);
                old_val = btrfs_block_group_used(&cache->item);
                /* clamp to the part of the range inside this group */
                num_bytes = min(total, cache->key.offset - byte_in_group);
                if (alloc) {
                        old_val += num_bytes;
                        cache->space_info->bytes_used += num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
                        spin_unlock(&cache->lock);
                } else {
                        old_val -= num_bytes;
                        cache->space_info->bytes_used -= num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
                        spin_unlock(&cache->lock);
                        if (mark_free) {
                                set_extent_dirty(&info->free_space_cache,
                                                 bytenr, bytenr + num_bytes - 1,
                                                 GFP_NOFS);
                        }
                }
                /* advance to the next block group, if any */
                total -= num_bytes;
                bytenr += num_bytes;
        }
        return 0;
}
1296
1297 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1298 {
1299         u64 start;
1300         u64 end;
1301         int ret;
1302         ret = find_first_extent_bit(&root->fs_info->block_group_cache,
1303                                     search_start, &start, &end,
1304                                     BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
1305                                     BLOCK_GROUP_SYSTEM);
1306         if (ret)
1307                 return 0;
1308         return start;
1309 }
1310
1311
/*
 * Pin (pin != 0) or unpin a byte range: toggle the range in the
 * fs-wide pinned_extents tree, then walk it block group by block group
 * updating the per-group, per-space-info and fs-wide pinned counters.
 *
 * When a stretch of the range falls in no block group, we skip forward
 * to the next group's start without accounting it against any group
 * (only fs_info->total_pinned would normally change, but with
 * cache == NULL the per-group updates are skipped anyway).
 *
 * Must be called with alloc_mutex held (asserted below).  Returns 0.
 */
static int update_pinned_extents(struct btrfs_root *root,
                                u64 bytenr, u64 num, int pin)
{
        u64 len;
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;

        WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        if (pin) {
                set_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        } else {
                clear_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        }
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
                if (!cache) {
                        /* gap before the next block group; skip to it */
                        u64 first = first_logical_byte(root, bytenr);
                        WARN_ON(first < bytenr);
                        len = min(first - bytenr, num);
                } else {
                        len = min(num, cache->key.offset -
                                  (bytenr - cache->key.objectid));
                }
                if (pin) {
                        if (cache) {
                                spin_lock(&cache->lock);
                                cache->pinned += len;
                                cache->space_info->bytes_pinned += len;
                                spin_unlock(&cache->lock);
                        }
                        fs_info->total_pinned += len;
                } else {
                        if (cache) {
                                spin_lock(&cache->lock);
                                cache->pinned -= len;
                                cache->space_info->bytes_pinned -= len;
                                spin_unlock(&cache->lock);
                        }
                        fs_info->total_pinned -= len;
                }
                bytenr += len;
                num -= len;
        }
        return 0;
}
1359
1360 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1361 {
1362         u64 last = 0;
1363         u64 start;
1364         u64 end;
1365         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1366         int ret;
1367
1368         while(1) {
1369                 ret = find_first_extent_bit(pinned_extents, last,
1370                                             &start, &end, EXTENT_DIRTY);
1371                 if (ret)
1372                         break;
1373                 set_extent_dirty(copy, start, end, GFP_NOFS);
1374                 last = end + 1;
1375         }
1376         return 0;
1377 }
1378
/*
 * After a transaction commit, release everything in 'unpin': drop the
 * pinned accounting, clear the range from 'unpin', and hand the bytes
 * back to the free_space_cache so they can be allocated again.
 *
 * Runs under alloc_mutex but drops and retakes it whenever a reschedule
 * is pending, to keep alloc_mutex hold times (and latency for other
 * allocators) low.  Always returns 0.
 */
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct extent_io_tree *unpin)
{
        u64 start;
        u64 end;
        int ret;
        struct extent_io_tree *free_space_cache;
        free_space_cache = &root->fs_info->free_space_cache;

        mutex_lock(&root->fs_info->alloc_mutex);
        while(1) {
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;
                update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
                set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
                /* breathe: let other alloc_mutex waiters in */
                if (need_resched()) {
                        mutex_unlock(&root->fs_info->alloc_mutex);
                        cond_resched();
                        mutex_lock(&root->fs_info->alloc_mutex);
                }
        }
        mutex_unlock(&root->fs_info->alloc_mutex);
        return 0;
}
1407
/*
 * Flush the queue of deferred extent-item insertions (info->extent_ins,
 * ranges tagged EXTENT_LOCKED).  For each queued range, insert the
 * extent item with refcount 1 and then a backref owned by the extent
 * root, using the first key of the corresponding tree block as a hint.
 *
 * Caller must hold alloc_mutex (asserted below); the mutex is dropped
 * around the blocking btrfs_read_buffer() call and at reschedule points
 * to reduce alloc_mutex latency.  Always returns 0; individual insert
 * failures go through BUG_ON.
 */
static int finish_current_insert(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *extent_root)
{
        u64 start;
        u64 end;
        struct btrfs_fs_info *info = extent_root->fs_info;
        struct extent_buffer *eb;
        struct btrfs_path *path;
        struct btrfs_key ins;
        struct btrfs_disk_key first;
        struct btrfs_extent_item extent_item;
        int ret;
        int level;
        int err = 0;

        WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
        btrfs_set_stack_extent_refs(&extent_item, 1);
        btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
        path = btrfs_alloc_path();

        while(1) {
                ret = find_first_extent_bit(&info->extent_ins, 0, &start,
                                            &end, EXTENT_LOCKED);
                if (ret)
                        break;

                ins.objectid = start;
                ins.offset = end + 1 - start;
                err = btrfs_insert_item(trans, extent_root, &ins,
                                        &extent_item, sizeof(extent_item));
                /* dequeue the range now that the item is inserted */
                clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
                                  GFP_NOFS);

                eb = btrfs_find_tree_block(extent_root, ins.objectid,
                                           ins.offset);

                if (!btrfs_buffer_uptodate(eb, trans->transid)) {
                        /* reading may block; don't hold alloc_mutex across it */
                        mutex_unlock(&extent_root->fs_info->alloc_mutex);
                        btrfs_read_buffer(eb, trans->transid);
                        mutex_lock(&extent_root->fs_info->alloc_mutex);
                }

                btrfs_tree_lock(eb);
                level = btrfs_header_level(eb);
                if (level == 0) {
                        btrfs_item_key(eb, &first, 0);
                } else {
                        btrfs_node_key(eb, &first, 0);
                }
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
                /*
                 * the first key is just a hint, so the race we've created
                 * against reading it is fine
                 */
                err = btrfs_insert_extent_backref(trans, extent_root, path,
                                          start, extent_root->root_key.objectid,
                                          0, level,
                                          btrfs_disk_key_objectid(&first));
                BUG_ON(err);
                /* keep alloc_mutex hold times short between iterations */
                if (need_resched()) {
                        mutex_unlock(&extent_root->fs_info->alloc_mutex);
                        cond_resched();
                        mutex_lock(&extent_root->fs_info->alloc_mutex);
                }
        }
        btrfs_free_path(path);
        return 0;
}
1477
/*
 * Pin a freed byte range until the transaction commits, or queue it for
 * later deletion when 'pending' is set.
 *
 * Fast path: if the range is a cached tree block that was allocated in
 * the running transaction and never written out, nobody can reference
 * it from disk -- clean it and return 1 so the caller may reuse the
 * space immediately instead of pinning it.  The try-lock avoids
 * deadlock/latency; if the block is contended we just fall through and
 * pin normally.  Returns 0 otherwise.
 *
 * Caller must hold alloc_mutex (asserted below).
 */
static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
                          int pending)
{
        int err = 0;

        WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        if (!pending) {
                struct extent_buffer *buf;
                buf = btrfs_find_tree_block(root, bytenr, num_bytes);
                if (buf) {
                        if (btrfs_try_tree_lock(buf) &&
                            btrfs_buffer_uptodate(buf, 0)) {
                                u64 transid =
                                    root->fs_info->running_transaction->transid;
                                u64 header_transid =
                                        btrfs_header_generation(buf);
                                /* born this transaction and never hit disk:
                                 * safe to reclaim right away */
                                if (header_transid == transid &&
                                    !btrfs_header_flag(buf,
                                               BTRFS_HEADER_FLAG_WRITTEN)) {
                                        clean_tree_block(NULL, root, buf);
                                        btrfs_tree_unlock(buf);
                                        free_extent_buffer(buf);
                                        return 1;
                                }
                                btrfs_tree_unlock(buf);
                        }
                        free_extent_buffer(buf);
                }
                update_pinned_extents(root, bytenr, num_bytes, 1);
        } else {
                /* defer: record in pending_del for del_pending_extents() */
                set_extent_bits(&root->fs_info->pending_del,
                                bytenr, bytenr + num_bytes - 1,
                                EXTENT_LOCKED, GFP_NOFS);
        }
        BUG_ON(err < 0);
        return 0;
}
1515
1516 /*
1517  * remove an extent from the root, returns 0 on success
1518  */
1519 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
1520                          *root, u64 bytenr, u64 num_bytes,
1521                          u64 root_objectid, u64 ref_generation,
1522                          u64 owner_objectid, u64 owner_offset, int pin,
1523                          int mark_free)
1524 {
1525         struct btrfs_path *path;
1526         struct btrfs_key key;
1527         struct btrfs_fs_info *info = root->fs_info;
1528         struct btrfs_root *extent_root = info->extent_root;
1529         struct extent_buffer *leaf;
1530         int ret;
1531         int extent_slot = 0;
1532         int found_extent = 0;
1533         int num_to_del = 1;
1534         struct btrfs_extent_item *ei;
1535         u32 refs;
1536
1537         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1538         key.objectid = bytenr;
1539         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1540         key.offset = num_bytes;
1541         path = btrfs_alloc_path();
1542         if (!path)
1543                 return -ENOMEM;
1544
1545         path->reada = 1;
1546         ret = lookup_extent_backref(trans, extent_root, path,
1547                                     bytenr, root_objectid,
1548                                     ref_generation,
1549                                     owner_objectid, owner_offset, 1);
1550         if (ret == 0) {
1551                 struct btrfs_key found_key;
1552                 extent_slot = path->slots[0];
1553                 while(extent_slot > 0) {
1554                         extent_slot--;
1555                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1556                                               extent_slot);
1557                         if (found_key.objectid != bytenr)
1558                                 break;
1559                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1560                             found_key.offset == num_bytes) {
1561                                 found_extent = 1;
1562                                 break;
1563                         }
1564                         if (path->slots[0] - extent_slot > 5)
1565                                 break;
1566                 }
1567                 if (!found_extent)
1568                         ret = btrfs_del_item(trans, extent_root, path);
1569         } else {
1570                 btrfs_print_leaf(extent_root, path->nodes[0]);
1571                 WARN_ON(1);
1572                 printk("Unable to find ref byte nr %Lu root %Lu "
1573                        " gen %Lu owner %Lu offset %Lu\n", bytenr,
1574                        root_objectid, ref_generation, owner_objectid,
1575                        owner_offset);
1576         }
1577         if (!found_extent) {
1578                 btrfs_release_path(extent_root, path);
1579                 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1580                 if (ret < 0)
1581                         return ret;
1582                 BUG_ON(ret);
1583                 extent_slot = path->slots[0];
1584         }
1585
1586         leaf = path->nodes[0];
1587         ei = btrfs_item_ptr(leaf, extent_slot,
1588                             struct btrfs_extent_item);
1589         refs = btrfs_extent_refs(leaf, ei);
1590         BUG_ON(refs == 0);
1591         refs -= 1;
1592         btrfs_set_extent_refs(leaf, ei, refs);
1593
1594         btrfs_mark_buffer_dirty(leaf);
1595
1596         if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1597                 /* if the back ref and the extent are next to each other
1598                  * they get deleted below in one shot
1599                  */
1600                 path->slots[0] = extent_slot;
1601                 num_to_del = 2;
1602         } else if (found_extent) {
1603                 /* otherwise delete the extent back ref */
1604                 ret = btrfs_del_item(trans, extent_root, path);
1605                 BUG_ON(ret);
1606                 /* if refs are 0, we need to setup the path for deletion */
1607                 if (refs == 0) {
1608                         btrfs_release_path(extent_root, path);
1609                         ret = btrfs_search_slot(trans, extent_root, &key, path,
1610                                                 -1, 1);
1611                         if (ret < 0)
1612                                 return ret;
1613                         BUG_ON(ret);
1614                 }
1615         }
1616
1617         if (refs == 0) {
1618                 u64 super_used;
1619                 u64 root_used;
1620
1621                 if (pin) {
1622                         ret = pin_down_bytes(root, bytenr, num_bytes, 0);
1623                         if (ret > 0)
1624                                 mark_free = 1;
1625                         BUG_ON(ret < 0);
1626                 }
1627
1628                 /* block accounting for super block */
1629                 spin_lock_irq(&info->delalloc_lock);
1630                 super_used = btrfs_super_bytes_used(&info->super_copy);
1631                 btrfs_set_super_bytes_used(&info->super_copy,
1632                                            super_used - num_bytes);
1633                 spin_unlock_irq(&info->delalloc_lock);
1634
1635                 /* block accounting for root item */
1636                 root_used = btrfs_root_used(&root->root_item);
1637                 btrfs_set_root_used(&root->root_item,
1638                                            root_used - num_bytes);
1639                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
1640                                       num_to_del);
1641                 if (ret) {
1642                         return ret;
1643                 }
1644                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1645                                          mark_free);
1646                 BUG_ON(ret);
1647         }
1648         btrfs_free_path(path);
1649         finish_current_insert(trans, extent_root);
1650         return ret;
1651 }
1652
1653 /*
1654  * find all the blocks marked as pending in the radix tree and remove
1655  * them from the extent map
1656  */
1657 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
1658                                btrfs_root *extent_root)
1659 {
1660         int ret;
1661         int err = 0;
1662         u64 start;
1663         u64 end;
1664         struct extent_io_tree *pending_del;
1665         struct extent_io_tree *pinned_extents;
1666
1667         WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
1668         pending_del = &extent_root->fs_info->pending_del;
1669         pinned_extents = &extent_root->fs_info->pinned_extents;
1670
1671         while(1) {
1672                 ret = find_first_extent_bit(pending_del, 0, &start, &end,
1673                                             EXTENT_LOCKED);
1674                 if (ret)
1675                         break;
1676                 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
1677                                   GFP_NOFS);
1678                 if (!test_range_bit(&extent_root->fs_info->extent_ins,
1679                                     start, end, EXTENT_LOCKED, 0)) {
1680                         update_pinned_extents(extent_root, start,
1681                                               end + 1 - start, 1);
1682                         ret = __free_extent(trans, extent_root,
1683                                              start, end + 1 - start,
1684                                              extent_root->root_key.objectid,
1685                                              0, 0, 0, 0, 0);
1686                 } else {
1687                         clear_extent_bits(&extent_root->fs_info->extent_ins,
1688                                           start, end, EXTENT_LOCKED, GFP_NOFS);
1689                 }
1690                 if (ret)
1691                         err = ret;
1692
1693                 if (need_resched()) {
1694                         mutex_unlock(&extent_root->fs_info->alloc_mutex);
1695                         cond_resched();
1696                         mutex_lock(&extent_root->fs_info->alloc_mutex);
1697                 }
1698         }
1699         return err;
1700 }
1701
1702 /*
1703  * remove an extent from the root, returns 0 on success
1704  */
1705 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
1706                                struct btrfs_root *root, u64 bytenr,
1707                                u64 num_bytes, u64 root_objectid,
1708                                u64 ref_generation, u64 owner_objectid,
1709                                u64 owner_offset, int pin)
1710 {
1711         struct btrfs_root *extent_root = root->fs_info->extent_root;
1712         int pending_ret;
1713         int ret;
1714
1715         WARN_ON(num_bytes < root->sectorsize);
1716         if (!root->ref_cows)
1717                 ref_generation = 0;
1718
1719         if (root == extent_root) {
1720                 pin_down_bytes(root, bytenr, num_bytes, 1);
1721                 return 0;
1722         }
1723         ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
1724                             ref_generation, owner_objectid, owner_offset,
1725                             pin, pin == 0);
1726
1727         finish_current_insert(trans, root->fs_info->extent_root);
1728         pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
1729         return ret ? ret : pending_ret;
1730 }
1731
1732 int btrfs_free_extent(struct btrfs_trans_handle *trans,
1733                       struct btrfs_root *root, u64 bytenr,
1734                       u64 num_bytes, u64 root_objectid,
1735                       u64 ref_generation, u64 owner_objectid,
1736                       u64 owner_offset, int pin)
1737 {
1738         int ret;
1739
1740         maybe_lock_mutex(root);
1741         ret = __btrfs_free_extent(trans, root, bytenr, num_bytes,
1742                                   root_objectid, ref_generation,
1743                                   owner_objectid, owner_offset, pin);
1744         maybe_unlock_mutex(root);
1745         return ret;
1746 }
1747
1748 static u64 stripe_align(struct btrfs_root *root, u64 val)
1749 {
1750         u64 mask = ((u64)root->stripesize - 1);
1751         u64 ret = (val + mask) & ~mask;
1752         return ret;
1753 }
1754
/*
 * walks the btree of allocated extents and find a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == block start
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == number of blocks
 * Any available blocks before search_start are skipped.
 *
 * The search retries via the check_failed/new_group labels: first from
 * the hint, then wrapping back to the original start, and finally with
 * a full scan (optionally forcing a chunk allocation) before giving up
 * with -ENOSPC.
 */
static int noinline find_free_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *orig_root,
                                     u64 num_bytes, u64 empty_size,
                                     u64 search_start, u64 search_end,
                                     u64 hint_byte, struct btrfs_key *ins,
                                     u64 exclude_start, u64 exclude_nr,
                                     int data)
{
        int ret;
        u64 orig_search_start;
        struct btrfs_root * root = orig_root->fs_info->extent_root;
        struct btrfs_fs_info *info = root->fs_info;
        u64 total_needed = num_bytes;
        u64 *last_ptr = NULL;   /* per-type cursor for clustered allocation */
        struct btrfs_block_group_cache *block_group;
        int full_scan = 0;
        int wrapped = 0;
        int chunk_alloc_done = 0;
        int empty_cluster = 2 * 1024 * 1024;
        int allowed_chunk_alloc = 0;

        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);

        if (orig_root->ref_cows || empty_size)
                allowed_chunk_alloc = 1;

        /* metadata keeps its own last-allocation cursor and a smaller
         * cluster size */
        if (data & BTRFS_BLOCK_GROUP_METADATA) {
                last_ptr = &root->fs_info->last_alloc;
                empty_cluster = 256 * 1024;
        }

        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
                last_ptr = &root->fs_info->last_data_alloc;
        }

        if (last_ptr) {
                if (*last_ptr)
                        hint_byte = *last_ptr;
                else {
                        /* no cursor yet: over-reserve to start a cluster */
                        empty_size += empty_cluster;
                }
        }

        search_start = max(search_start, first_logical_byte(root, 0));
        orig_search_start = search_start;

        if (search_end == (u64)-1)
                search_end = btrfs_super_total_bytes(&info->super_copy);

        if (hint_byte) {
                block_group = btrfs_lookup_first_block_group(info, hint_byte);
                if (!block_group)
                        hint_byte = search_start;
                block_group = btrfs_find_block_group(root, block_group,
                                                     hint_byte, data, 1);
                if (last_ptr && *last_ptr == 0 && block_group)
                        hint_byte = block_group->key.objectid;
        } else {
                block_group = btrfs_find_block_group(root,
                                                     trans->block_group,
                                                     search_start, data, 1);
        }
        search_start = max(search_start, hint_byte);

        total_needed += empty_size;

check_failed:
        if (!block_group) {
                block_group = btrfs_lookup_first_block_group(info,
                                                             search_start);
                if (!block_group)
                        block_group = btrfs_lookup_first_block_group(info,
                                                       orig_search_start);
        }
        /* on the full scan, try growing the pool once before failing */
        if (full_scan && !chunk_alloc_done) {
                if (allowed_chunk_alloc) {
                        do_chunk_alloc(trans, root,
                                     num_bytes + 2 * 1024 * 1024, data, 1);
                        allowed_chunk_alloc = 0;
                } else if (block_group && block_group_bits(block_group, data)) {
                        block_group->space_info->force_alloc = 1;
                }
                chunk_alloc_done = 1;
        }
        ret = find_search_start(root, &block_group, &search_start,
                                total_needed, data);
        /* cluster cursor exhausted: drop it and retry from the start */
        if (ret == -ENOSPC && last_ptr && *last_ptr) {
                *last_ptr = 0;
                block_group = btrfs_lookup_first_block_group(info,
                                                             orig_search_start);
                search_start = orig_search_start;
                ret = find_search_start(root, &block_group, &search_start,
                                        total_needed, data);
        }
        if (ret == -ENOSPC)
                goto enospc;
        if (ret)
                goto error;

        /* couldn't continue the cluster: restart with room for a new one */
        if (last_ptr && *last_ptr && search_start != *last_ptr) {
                *last_ptr = 0;
                if (!empty_size) {
                        empty_size += empty_cluster;
                        total_needed += empty_size;
                }
                block_group = btrfs_lookup_first_block_group(info,
                                                       orig_search_start);
                search_start = orig_search_start;
                ret = find_search_start(root, &block_group,
                                        &search_start, total_needed, data);
                if (ret == -ENOSPC)
                        goto enospc;
                if (ret)
                        goto error;
        }

        search_start = stripe_align(root, search_start);
        ins->objectid = search_start;
        ins->offset = num_bytes;

        if (ins->objectid + num_bytes >= search_end)
                goto enospc;

        /* candidate crosses the end of its block group */
        if (ins->objectid + num_bytes >
            block_group->key.objectid + block_group->key.offset) {
                search_start = block_group->key.objectid +
                        block_group->key.offset;
                goto new_group;
        }

        /* range already claimed by a pending insert */
        if (test_range_bit(&info->extent_ins, ins->objectid,
                           ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
                search_start = ins->objectid + num_bytes;
                goto new_group;
        }

        /* range is pinned until the running transaction commits */
        if (test_range_bit(&info->pinned_extents, ins->objectid,
                           ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
                search_start = ins->objectid + num_bytes;
                goto new_group;
        }

        /* range overlaps the caller-supplied exclusion window */
        if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
            ins->objectid < exclude_start + exclude_nr)) {
                search_start = exclude_start + exclude_nr;
                goto new_group;
        }

        if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
                block_group = btrfs_lookup_block_group(info, ins->objectid);
                if (block_group)
                        trans->block_group = block_group;
        }
        ins->offset = num_bytes;
        if (last_ptr) {
                /* remember where to resume; wrap to 0 at end of device */
                *last_ptr = ins->objectid + ins->offset;
                if (*last_ptr ==
                    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
                        *last_ptr = 0;
                }
        }
        return 0;

new_group:
        if (search_start + num_bytes >= search_end) {
enospc:
                search_start = orig_search_start;
                if (full_scan) {
                        ret = -ENOSPC;
                        goto error;
                }
                /* first wrap retries in place; second escalates to a
                 * full scan without the cluster padding */
                if (wrapped) {
                        if (!full_scan)
                                total_needed -= empty_size;
                        full_scan = 1;
                } else
                        wrapped = 1;
        }
        block_group = btrfs_lookup_first_block_group(info, search_start);
        cond_resched();
        block_group = btrfs_find_block_group(root, block_group,
                                             search_start, data, 0);
        goto check_failed;

error:
        return ret;
}
1951
/*
 * Reserve num_bytes of space and record the result in *ins.
 *
 * Picks the allocation profile (data / system / metadata) from the
 * caller's root, pre-allocates chunks when needed, and calls
 * find_free_extent.  On -ENOSPC the request is retried with half the
 * size (never below min_alloc_size) after forcing a chunk allocation.
 * Any other failure is fatal (BUG).
 *
 * NOTE(review): `data` is reused as the combined block-group-flags
 * value after the first if/else chain; it no longer means just
 * "is this a data allocation" below that point.
 */
static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  u64 num_bytes, u64 min_alloc_size,
                                  u64 empty_size, u64 hint_byte,
                                  u64 search_end, struct btrfs_key *ins,
                                  u64 data)
{
        int ret;
        u64 search_start = 0;
        u64 alloc_profile;
        struct btrfs_fs_info *info = root->fs_info;

        if (data) {
                alloc_profile = info->avail_data_alloc_bits &
                                info->data_alloc_profile;
                data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
        } else if (root == root->fs_info->chunk_root) {
                alloc_profile = info->avail_system_alloc_bits &
                                info->system_alloc_profile;
                data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
        } else {
                alloc_profile = info->avail_metadata_alloc_bits &
                                info->metadata_alloc_profile;
                data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
        }
again:
        data = reduce_alloc_profile(root, data);
        /*
         * the only place that sets empty_size is btrfs_realloc_node, which
         * is not called recursively on allocations
         */
        if (empty_size || root->ref_cows) {
                /* data allocations also top up metadata space */
                if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                     2 * 1024 * 1024,
                                     BTRFS_BLOCK_GROUP_METADATA |
                                     (info->metadata_alloc_profile &
                                      info->avail_metadata_alloc_bits), 0);
                        BUG_ON(ret);
                }
                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                     num_bytes + 2 * 1024 * 1024, data, 0);
                BUG_ON(ret);
        }

        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
                               search_start, search_end, hint_byte, ins,
                               trans->alloc_exclude_start,
                               trans->alloc_exclude_nr, data);

        /* out of space: halve the request and force a chunk allocation */
        if (ret == -ENOSPC && num_bytes > min_alloc_size) {
                num_bytes = num_bytes >> 1;
                num_bytes = max(num_bytes, min_alloc_size);
                do_chunk_alloc(trans, root->fs_info->extent_root,
                               num_bytes, data, 1);
                goto again;
        }
        if (ret) {
                printk("allocation failed flags %Lu\n", data);
                BUG();
        }
        /* the reserved range is no longer free space */
        clear_extent_dirty(&root->fs_info->free_space_cache,
                           ins->objectid, ins->objectid + ins->offset - 1,
                           GFP_NOFS);
        return 0;
}
2019
2020 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2021                                   struct btrfs_root *root,
2022                                   u64 num_bytes, u64 min_alloc_size,
2023                                   u64 empty_size, u64 hint_byte,
2024                                   u64 search_end, struct btrfs_key *ins,
2025                                   u64 data)
2026 {
2027         int ret;
2028         maybe_lock_mutex(root);
2029         ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2030                                      empty_size, hint_byte, search_end, ins,
2031                                      data);
2032         maybe_unlock_mutex(root);
2033         return ret;
2034 }
2035
/*
 * Record a previously reserved extent (*ins) in the extent tree:
 * inserts the extent item and its back reference in one go, and bumps
 * the superblock / root-item byte accounting.
 *
 * Inserts for the extent root itself are only queued in the extent_ins
 * tree (processed later by finish_current_insert) to avoid recursing
 * into the extent tree while it is being modified.
 *
 * NOTE(review): accounting is updated before the item insertion;
 * the insertion failure path (BUG_ON) never unwinds it.
 */
static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root,
                                         u64 root_objectid, u64 ref_generation,
                                         u64 owner, u64 owner_offset,
                                         struct btrfs_key *ins)
{
        int ret;
        int pending_ret;
        u64 super_used;
        u64 root_used;
        u64 num_bytes = ins->offset;
        u32 sizes[2];
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_root *extent_root = info->extent_root;
        struct btrfs_extent_item *extent_item;
        struct btrfs_extent_ref *ref;
        struct btrfs_path *path;
        struct btrfs_key keys[2];

        /* block accounting for super block */
        spin_lock_irq(&info->delalloc_lock);
        super_used = btrfs_super_bytes_used(&info->super_copy);
        btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
        spin_unlock_irq(&info->delalloc_lock);

        /* block accounting for root item */
        root_used = btrfs_root_used(&root->root_item);
        btrfs_set_root_used(&root->root_item, root_used + num_bytes);

        if (root == extent_root) {
                /* defer the insert; see finish_current_insert */
                set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
                                ins->objectid + ins->offset - 1,
                                EXTENT_LOCKED, GFP_NOFS);
                goto update_block;
        }

        /* keys[0] is the extent item, keys[1] its back reference */
        memcpy(&keys[0], ins, sizeof(*ins));
        keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
                                         owner, owner_offset);
        keys[1].objectid = ins->objectid;
        keys[1].type = BTRFS_EXTENT_REF_KEY;
        sizes[0] = sizeof(*extent_item);
        sizes[1] = sizeof(*ref);

        path = btrfs_alloc_path();
        BUG_ON(!path);

        /* insert both items with a single leaf operation */
        ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
                                       sizes, 2);

        BUG_ON(ret);
        extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_extent_item);
        btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
        ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
                             struct btrfs_extent_ref);

        btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
        btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
        btrfs_set_ref_objectid(path->nodes[0], ref, owner);
        btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);

        btrfs_mark_buffer_dirty(path->nodes[0]);

        /* the reserved range is committed; drop the exclusion window */
        trans->alloc_exclude_start = 0;
        trans->alloc_exclude_nr = 0;
        btrfs_free_path(path);
        finish_current_insert(trans, extent_root);
        pending_ret = del_pending_extents(trans, extent_root);

        if (ret)
                goto out;
        if (pending_ret) {
                ret = pending_ret;
                goto out;
        }

update_block:
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
        if (ret) {
                printk("update block group failed for %Lu %Lu\n",
                       ins->objectid, ins->offset);
                BUG();
        }
out:
        return ret;
}
2123
2124 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2125                                 struct btrfs_root *root,
2126                                 u64 root_objectid, u64 ref_generation,
2127                                 u64 owner, u64 owner_offset,
2128                                 struct btrfs_key *ins)
2129 {
2130         int ret;
2131         maybe_lock_mutex(root);
2132         ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
2133                                             ref_generation, owner,
2134                                             owner_offset, ins);
2135         maybe_unlock_mutex(root);
2136         return ret;
2137 }
2138 /*
2139  * finds a free extent and does all the dirty work required for allocation
2140  * returns the key for the extent through ins, and a tree buffer for
2141  * the first block of the extent through buf.
2142  *
2143  * returns 0 if everything worked, non-zero otherwise.
2144  */
2145 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
2146                        struct btrfs_root *root,
2147                        u64 num_bytes, u64 min_alloc_size,
2148                        u64 root_objectid, u64 ref_generation,
2149                        u64 owner, u64 owner_offset,
2150                        u64 empty_size, u64 hint_byte,
2151                        u64 search_end, struct btrfs_key *ins, u64 data)
2152 {
2153         int ret;
2154
2155         maybe_lock_mutex(root);
2156
2157         ret = __btrfs_reserve_extent(trans, root, num_bytes,
2158                                      min_alloc_size, empty_size, hint_byte,
2159                                      search_end, ins, data);
2160         BUG_ON(ret);
2161         ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
2162                                             ref_generation, owner,
2163                                             owner_offset, ins);
2164         BUG_ON(ret);
2165
2166         maybe_unlock_mutex(root);
2167         return ret;
2168 }
/*
 * helper function to allocate a block for a given tree
 * returns the tree buffer or NULL.
 *
 * Allocates one tree block of the given blocksize (owner = level,
 * owner_offset = first_objectid in the back reference), locks it,
 * wipes any stale content and marks it dirty in the running
 * transaction.  On failure returns ERR_PTR(-errno); on buffer
 * allocation failure the reserved extent is freed again.
 */
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             u32 blocksize,
                                             u64 root_objectid,
                                             u64 ref_generation,
                                             u64 first_objectid,
                                             int level,
                                             u64 hint,
                                             u64 empty_size)
{
        struct btrfs_key ins;
        int ret;
        struct extent_buffer *buf;

        ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
                                 root_objectid, ref_generation,
                                 level, first_objectid, empty_size, hint,
                                 (u64)-1, &ins, 0);
        if (ret) {
                BUG_ON(ret > 0);
                return ERR_PTR(ret);
        }
        buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
        if (!buf) {
                /* give the reserved extent back before bailing out */
                btrfs_free_extent(trans, root, ins.objectid, blocksize,
                                  root->root_key.objectid, ref_generation,
                                  0, 0, 0);
                return ERR_PTR(-ENOMEM);
        }
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_tree_lock(buf);
        /* scrub any stale data left from a previous use of this block */
        clean_tree_block(trans, root, buf);
        btrfs_set_buffer_uptodate(buf);

        if (PageDirty(buf->first_page)) {
                printk("page %lu dirty\n", buf->first_page->index);
                WARN_ON(1);
        }

        /* queue the block for writeback with this transaction */
        set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
        trans->blocks_used++;
        return buf;
}
2217
/*
 * Drop one reference on every file-data extent referenced by this leaf.
 *
 * Used while tearing down a snapshot/subtree.  Inline extents and
 * holes (disk_bytenr == 0) are skipped.
 *
 * Locking: the caller holds alloc_mutex.  It is dropped for the scan
 * and retaken only around each __btrfs_free_extent call to reduce
 * latency; it is held again on return.
 */
static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct extent_buffer *leaf)
{
        u64 leaf_owner;
        u64 leaf_generation;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        int i;
        int nritems;
        int ret;

        BUG_ON(!btrfs_is_leaf(leaf));
        nritems = btrfs_header_nritems(leaf);
        leaf_owner = btrfs_header_owner(leaf);
        leaf_generation = btrfs_header_generation(leaf);

        mutex_unlock(&root->fs_info->alloc_mutex);

        for (i = 0; i < nritems; i++) {
                u64 disk_bytenr;
                cond_resched();

                btrfs_item_key_to_cpu(leaf, &key, i);
                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                        continue;
                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
                if (btrfs_file_extent_type(leaf, fi) ==
                    BTRFS_FILE_EXTENT_INLINE)
                        continue;
                /*
                 * FIXME make sure to insert a trans record that
                 * repeats the snapshot del on crash
                 */
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                if (disk_bytenr == 0)
                        continue;

                mutex_lock(&root->fs_info->alloc_mutex);
                ret = __btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
                                leaf_owner, leaf_generation,
                                key.objectid, key.offset, 0);
                mutex_unlock(&root->fs_info->alloc_mutex);
                BUG_ON(ret);
        }

        /* re-acquire for the caller, which expects the mutex held */
        mutex_lock(&root->fs_info->alloc_mutex);
        return 0;
}
2268
/*
 * best-effort readahead of the tree blocks referenced from 'node',
 * starting at 'slot'.  Blocks that are far from the last block read
 * (more than 32k away in either direction) or that are shared
 * (ref count != 1) are skipped; after 32 skips we give up entirely.
 * Errors from readahead_tree_block() just end the loop early.
 *
 * NOTE(review): the early return when level != 0 together with
 * btrfs_level_size(root, level - 1) below looks inconsistent (level is
 * always 0 past the check, so level - 1 is -1) -- verify against
 * btrfs_level_size()/btrfs_node_blockptr() semantics before relying on
 * which levels actually get readahead.
 */
static void noinline reada_walk_down(struct btrfs_root *root,
				     struct extent_buffer *node,
				     int slot)
{
	u64 bytenr;
	u64 last = 0;
	u32 nritems;
	u32 refs;
	u32 blocksize;
	int ret;
	int i;
	int level;
	int skipped = 0;

	nritems = btrfs_header_nritems(node);
	level = btrfs_header_level(node);
	if (level)
		return;

	for (i = slot; i < nritems && skipped < 32; i++) {
		bytenr = btrfs_node_blockptr(node, i);
		/* skip blocks that would cause a long seek */
		if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
			     (last > bytenr && last - bytenr > 32 * 1024))) {
			skipped++;
			continue;
		}
		blocksize = btrfs_level_size(root, level - 1);
		if (i != slot) {
			/* shared blocks won't be freed, don't bother */
			ret = lookup_extent_ref(NULL, root, bytenr,
						blocksize, &refs);
			BUG_ON(ret);
			if (refs != 1) {
				skipped++;
				continue;
			}
		}
		ret = readahead_tree_block(root, bytenr, blocksize,
					   btrfs_node_ptr_generation(node, i));
		last = bytenr + blocksize;
		cond_resched();
		if (ret)
			break;
	}
}
2313
/*
 * we want to avoid as much random IO as we can with the alloc mutex
 * held, so drop the lock and do the lookup, then do it again with the
 * lock held.
 */
int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
			      u32 *refs)
{
	/* unlocked pass: pulls the extent tree blocks into cache */
	mutex_unlock(&root->fs_info->alloc_mutex);
	lookup_extent_ref(NULL, root, start, len, refs);
	cond_resched();
	mutex_lock(&root->fs_info->alloc_mutex);
	/* authoritative pass under the mutex; only this result is used */
	return lookup_extent_ref(NULL, root, start, len, refs);
}
2328
/*
 * helper function for drop_snapshot, this walks down the tree dropping ref
 * counts as it goes.
 *
 * Takes alloc_mutex on entry and drops it before returning;
 * drop_snap_lookup_refcount() and the read_tree_block() path release it
 * temporarily so IO is not done with the mutex held.  On return the
 * block at path->nodes[*level] has been freed and *level incremented,
 * so the caller can continue with walk_up_tree().  Always returns 0.
 */
static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level)
{
	u64 root_owner;
	u64 root_gen;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret;
	u32 refs;

	mutex_lock(&root->fs_info->alloc_mutex);

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);
	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
				path->nodes[*level]->len, &refs);
	BUG_ON(ret);
	/* block is shared with another tree: just drop our ref at 'out' */
	if (refs > 1)
		goto out;

	/*
	 * walk down to the last node level and free all the leaves
	 */
	while(*level >= 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		/* no more slots in this node, pop back up */
		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;
		if (*level == 0) {
			/* leaf: free all the file extents it references */
			ret = drop_leaf_ref(trans, root, cur);
			BUG_ON(ret);
			break;
		}
		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
		BUG_ON(ret);
		if (refs != 1) {
			/* child is shared: drop one ref and move on */
			parent = path->nodes[*level];
			root_owner = btrfs_header_owner(parent);
			root_gen = btrfs_header_generation(parent);
			path->slots[*level]++;
			ret = __btrfs_free_extent(trans, root, bytenr,
						blocksize, root_owner,
						root_gen, 0, 0, 1);
			BUG_ON(ret);
			continue;
		}
		next = btrfs_find_tree_block(root, bytenr, blocksize);
		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
			free_extent_buffer(next);
			/* drop the mutex across the read IO */
			mutex_unlock(&root->fs_info->alloc_mutex);

			if (path->slots[*level] == 0)
				reada_walk_down(root, cur, path->slots[*level]);

			next = read_tree_block(root, bytenr, blocksize,
					       ptr_gen);
			cond_resched();
			mutex_lock(&root->fs_info->alloc_mutex);

			/* we've dropped the lock, double check */
			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
						&refs);
			BUG_ON(ret);
			if (refs != 1) {
				parent = path->nodes[*level];
				root_owner = btrfs_header_owner(parent);
				root_gen = btrfs_header_generation(parent);

				path->slots[*level]++;
				free_extent_buffer(next);
				ret = __btrfs_free_extent(trans, root, bytenr,
							blocksize,
							root_owner,
							root_gen, 0, 0, 1);
				BUG_ON(ret);
				continue;
			}
		}
		WARN_ON(*level <= 0);
		/* descend into 'next' */
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
	}
out:
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	/* free the block we stopped at and hand back the level above it */
	if (path->nodes[*level] == root->node) {
		root_owner = root->root_key.objectid;
		parent = path->nodes[*level];
	} else {
		parent = path->nodes[*level + 1];
		root_owner = btrfs_header_owner(parent);
	}

	root_gen = btrfs_header_generation(parent);
	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
				path->nodes[*level]->len,
				root_owner, root_gen, 0, 0, 1);
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;
	BUG_ON(ret);
	mutex_unlock(&root->fs_info->alloc_mutex);
	cond_resched();
	return 0;
}
2457
/*
 * helper for dropping snapshots.  This walks back up the tree in the path
 * to find the first node higher up where we haven't yet gone through
 * all the slots
 *
 * When a node with unvisited slots is found, the next key to process is
 * recorded in root_item->drop_progress / drop_level so an interrupted
 * drop can resume, and 0 is returned.  When every node in the path is
 * exhausted, the remaining blocks are freed and 1 is returned to signal
 * the whole tree has been dropped.
 */
static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level)
{
	u64 root_owner;
	u64 root_gen;
	struct btrfs_root_item *root_item = &root->root_item;
	int i;
	int slot;
	int ret;

	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
			struct extent_buffer *node;
			struct btrfs_disk_key disk_key;
			node = path->nodes[i];
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			/* persist where to resume after a restart */
			btrfs_node_key(node, &disk_key, path->slots[i]);
			memcpy(&root_item->drop_progress,
			       &disk_key, sizeof(disk_key));
			root_item->drop_level = i;
			return 0;
		} else {
			/* this node is finished; free it and keep climbing */
			if (path->nodes[*level] == root->node) {
				root_owner = root->root_key.objectid;
				root_gen =
				   btrfs_header_generation(path->nodes[*level]);
			} else {
				struct extent_buffer *node;
				node = path->nodes[*level + 1];
				root_owner = btrfs_header_owner(node);
				root_gen = btrfs_header_generation(node);
			}
			ret = btrfs_free_extent(trans, root,
						path->nodes[*level]->start,
						path->nodes[*level]->len,
						root_owner, root_gen, 0, 0, 1);
			BUG_ON(ret);
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}
2511
/*
 * drop the reference count on the tree rooted at 'snap'.  This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
 *
 * Must be called with fs_info->drop_mutex held.  If a previous drop was
 * interrupted, the walk resumes from root_item->drop_progress.  Returns
 * 0 when the tree has been fully dropped, -EAGAIN if the walk stopped
 * because the running transaction started committing, or a negative
 * errno from the tree search.
 */
int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
			*root)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int i;
	int orig_level;
	struct btrfs_root_item *root_item = &root->root_item;

	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
	path = btrfs_alloc_path();
	BUG_ON(!path);

	level = btrfs_header_level(root->node);
	orig_level = level;
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* fresh drop: start walking from the root block */
		path->nodes[level] = root->node;
		extent_buffer_get(root->node);
		path->slots[level] = 0;
	} else {
		/* resume an interrupted drop at the recorded key/level */
		struct btrfs_key key;
		struct btrfs_disk_key found_key;
		struct extent_buffer *node;

		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		level = root_item->drop_level;
		path->lowest_level = level;
		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		node = path->nodes[level];
		btrfs_node_key(node, &found_key, path->slots[level]);
		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
			       sizeof(found_key)));
		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
			if (path->nodes[i] && path->locks[i]) {
				path->locks[i] = 0;
				btrfs_tree_unlock(path->nodes[i]);
			}
		}
	}
	while(1) {
		wret = walk_down_tree(trans, root, path, &level);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;

		wret = walk_up_tree(trans, root, path, &level);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;
		/* bail out so we don't block a transaction commit */
		if (trans->transaction->in_commit) {
			ret = -EAGAIN;
			break;
		}
	}
	/* drop the extra references the path still holds */
	for (i = 0; i <= orig_level; i++) {
		if (path->nodes[i]) {
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}
	}
out:
	btrfs_free_path(path);
	return ret;
}
2593
2594 int btrfs_free_block_groups(struct btrfs_fs_info *info)
2595 {
2596         u64 start;
2597         u64 end;
2598         u64 ptr;
2599         int ret;
2600
2601         mutex_lock(&info->alloc_mutex);
2602         while(1) {
2603                 ret = find_first_extent_bit(&info->block_group_cache, 0,
2604                                             &start, &end, (unsigned int)-1);
2605                 if (ret)
2606                         break;
2607                 ret = get_state_private(&info->block_group_cache, start, &ptr);
2608                 if (!ret)
2609                         kfree((void *)(unsigned long)ptr);
2610                 clear_extent_bits(&info->block_group_cache, start,
2611                                   end, (unsigned int)-1, GFP_NOFS);
2612         }
2613         while(1) {
2614                 ret = find_first_extent_bit(&info->free_space_cache, 0,
2615                                             &start, &end, EXTENT_DIRTY);
2616                 if (ret)
2617                         break;
2618                 clear_extent_dirty(&info->free_space_cache, start,
2619                                    end, GFP_NOFS);
2620         }
2621         mutex_unlock(&info->alloc_mutex);
2622         return 0;
2623 }
2624
/*
 * compute the final page index of a readahead window of 'nr' pages
 * starting at 'start', clamped so it never extends past 'last'.
 */
static unsigned long calc_ra(unsigned long start, unsigned long last,
			     unsigned long nr)
{
	unsigned long window_end = start + nr - 1;

	return window_end < last ? window_end : last;
}
2630
2631 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
2632                                          u64 len)
2633 {
2634         u64 page_start;
2635         u64 page_end;
2636         unsigned long last_index;
2637         unsigned long i;
2638         struct page *page;
2639         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2640         struct file_ra_state *ra;
2641         unsigned long total_read = 0;
2642         unsigned long ra_pages;
2643         struct btrfs_trans_handle *trans;
2644
2645         ra = kzalloc(sizeof(*ra), GFP_NOFS);
2646
2647         mutex_lock(&inode->i_mutex);
2648         i = start >> PAGE_CACHE_SHIFT;
2649         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2650
2651         ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
2652
2653         file_ra_state_init(ra, inode->i_mapping);
2654
2655         for (; i <= last_index; i++) {
2656                 if (total_read % ra_pages == 0) {
2657                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2658                                        calc_ra(i, last_index, ra_pages));
2659                 }
2660                 total_read++;
2661                 if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
2662                         goto truncate_racing;
2663
2664                 page = grab_cache_page(inode->i_mapping, i);
2665                 if (!page) {
2666                         goto out_unlock;
2667                 }
2668                 if (!PageUptodate(page)) {
2669                         btrfs_readpage(NULL, page);
2670                         lock_page(page);
2671                         if (!PageUptodate(page)) {
2672                                 unlock_page(page);
2673                                 page_cache_release(page);
2674                                 goto out_unlock;
2675                         }
2676                 }
2677 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
2678                 ClearPageDirty(page);
2679 #else
2680                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2681 #endif
2682                 wait_on_page_writeback(page);
2683                 set_page_extent_mapped(page);
2684                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2685                 page_end = page_start + PAGE_CACHE_SIZE - 1;
2686
2687                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2688
2689                 set_extent_delalloc(io_tree, page_start,
2690                                     page_end, GFP_NOFS);
2691                 set_page_dirty(page);
2692
2693                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2694                 unlock_page(page);
2695                 page_cache_release(page);
2696         }
2697         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2698                                            total_read);
2699
2700 out_unlock:
2701         kfree(ra);
2702         trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
2703         if (trans) {
2704                 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
2705                 mark_inode_dirty(inode);
2706         }
2707         mutex_unlock(&inode->i_mutex);
2708         return 0;
2709
2710 truncate_racing:
2711         vmtruncate(inode, inode->i_size);
2712         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2713                                            total_read);
2714         goto out_unlock;
2715 }
2716
/*
 * The back references tell us which tree holds a ref on a block,
 * but it is possible for the tree root field in the reference to
 * reflect the original root before a snapshot was made.  In this
 * case we should search through all the children of a given root
 * to find potential holders of references on a block.
 *
 * Instead, we do something a little less fancy and just search
 * all the roots for a given key/block combination.
 *
 * On success *found_root is set to the root that actually references
 * 'bytenr' and 0 is returned; non-zero means no root was found.
 * 'file_key' selects whether key0 names a file extent (compare the
 * extent's disk bytenr) or a tree block (compare the block start).
 */
static int find_root_for_ref(struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_key *key0,
			     int level,
			     int file_key,
			     struct btrfs_root **found_root,
			     u64 bytenr)
{
	struct btrfs_key root_location;
	struct btrfs_root *cur_root = *found_root;
	struct btrfs_file_extent_item *file_extent;
	u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
	u64 found_bytenr;
	int ret;

	root_location.offset = (u64)-1;
	root_location.type = BTRFS_ROOT_ITEM_KEY;
	path->lowest_level = level;
	path->reada = 0;
	while(1) {
		ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
		found_bytenr = 0;
		if (ret == 0 && file_key) {
			/* file extent ref: compare the on-disk bytenr */
			struct extent_buffer *leaf = path->nodes[0];
			file_extent = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(leaf, file_extent) ==
			    BTRFS_FILE_EXTENT_REG) {
				found_bytenr =
					btrfs_file_extent_disk_bytenr(leaf,
							       file_extent);
		       }
		} else if (!file_key) {
			/* tree block ref: compare the block start */
			if (path->nodes[level])
				found_bytenr = path->nodes[level]->start;
		}

		btrfs_release_path(cur_root, path);

		if (found_bytenr == bytenr) {
			*found_root = cur_root;
			ret = 0;
			goto out;
		}
		/* no match, try the next root in the tree of tree roots */
		ret = btrfs_search_root(root->fs_info->tree_root,
					root_search_start, &root_search_start);
		if (ret)
			break;

		root_location.objectid = root_search_start;
		cur_root = btrfs_read_fs_root_no_name(root->fs_info,
						      &root_location);
		if (!cur_root) {
			ret = 1;
			break;
		}
	}
out:
	path->lowest_level = 0;
	return ret;
}
2788
/*
 * note, this releases the path
 *
 * Relocate whatever holds one back reference of an extent in a block
 * group that is being removed.  File data is relocated by re-dirtying
 * the referencing pages (relocate_inode_pages); a tree block is
 * relocated by COWing it, which happens as a side effect of searching
 * to it inside a transaction.
 *
 * Called with alloc_mutex held; the mutex is dropped around the
 * IO-heavy work and re-acquired before returning.  The last_file_*
 * parameters cache the previously relocated file range so repeated
 * refs to the same range are skipped.  Always returns 0.
 */
static int noinline relocate_one_reference(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  struct btrfs_key *extent_key,
				  u64 *last_file_objectid,
				  u64 *last_file_offset,
				  u64 *last_file_root,
				  u64 last_extent)
{
	struct inode *inode;
	struct btrfs_root *found_root;
	struct btrfs_key root_location;
	struct btrfs_key found_key;
	struct btrfs_extent_ref *ref;
	u64 ref_root;
	u64 ref_gen;
	u64 ref_objectid;
	u64 ref_offset;
	int ret;
	int level;

	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));

	/* copy everything out of the ref item before dropping the path */
	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
			     struct btrfs_extent_ref);
	ref_root = btrfs_ref_root(path->nodes[0], ref);
	ref_gen = btrfs_ref_generation(path->nodes[0], ref);
	ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
	ref_offset = btrfs_ref_offset(path->nodes[0], ref);
	btrfs_release_path(extent_root, path);

	root_location.objectid = ref_root;
	if (ref_gen == 0)
		root_location.offset = 0;
	else
		root_location.offset = (u64)-1;
	root_location.type = BTRFS_ROOT_ITEM_KEY;

	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
						&root_location);
	BUG_ON(!found_root);
	/* the heavy lifting below must not hold the alloc mutex */
	mutex_unlock(&extent_root->fs_info->alloc_mutex);

	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
		/* data extent: re-dirty the file pages that reference it */
		found_key.objectid = ref_objectid;
		found_key.type = BTRFS_EXTENT_DATA_KEY;
		found_key.offset = ref_offset;
		level = 0;

		/* skip ranges we already relocated for this extent */
		if (last_extent == extent_key->objectid &&
		    *last_file_objectid == ref_objectid &&
		    *last_file_offset == ref_offset &&
		    *last_file_root == ref_root)
			goto out;

		ret = find_root_for_ref(extent_root, path, &found_key,
					level, 1, &found_root,
					extent_key->objectid);

		if (ret)
			goto out;

		if (last_extent == extent_key->objectid &&
		    *last_file_objectid == ref_objectid &&
		    *last_file_offset == ref_offset &&
		    *last_file_root == ref_root)
			goto out;

		inode = btrfs_iget_locked(extent_root->fs_info->sb,
					  ref_objectid, found_root);
		if (inode->i_state & I_NEW) {
			/* the inode and parent dir are two different roots */
			BTRFS_I(inode)->root = found_root;
			BTRFS_I(inode)->location.objectid = ref_objectid;
			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
			BTRFS_I(inode)->location.offset = 0;
			btrfs_read_locked_inode(inode);
			unlock_new_inode(inode);

		}
		/* this can happen if the reference is not against
		 * the latest version of the tree root
		 */
		if (is_bad_inode(inode))
			goto out;

		*last_file_objectid = inode->i_ino;
		*last_file_root = found_root->root_key.objectid;
		*last_file_offset = ref_offset;

		relocate_inode_pages(inode, ref_offset, extent_key->offset);
		iput(inode);
	} else {
		/* tree block: force a COW by searching to it with cow=1 */
		struct btrfs_trans_handle *trans;
		struct extent_buffer *eb;
		int needs_lock = 0;

		eb = read_tree_block(found_root, extent_key->objectid,
				     extent_key->offset, 0);
		btrfs_tree_lock(eb);
		level = btrfs_header_level(eb);

		if (level == 0)
			btrfs_item_key_to_cpu(eb, &found_key, 0);
		else
			btrfs_node_key_to_cpu(eb, &found_key, 0);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);

		ret = find_root_for_ref(extent_root, path, &found_key,
					level, 0, &found_root,
					extent_key->objectid);

		if (ret)
			goto out;

		/*
		 * right here almost anything could happen to our key,
		 * but that's ok.  The cow below will either relocate it
		 * or someone else will have relocated it.  Either way,
		 * it is in a different spot than it was before and
		 * we're happy.
		 */

		trans = btrfs_start_transaction(found_root, 1);

		/* allocations in these roots require the alloc mutex */
		if (found_root == extent_root->fs_info->extent_root ||
		    found_root == extent_root->fs_info->chunk_root ||
		    found_root == extent_root->fs_info->dev_root) {
			needs_lock = 1;
			mutex_lock(&extent_root->fs_info->alloc_mutex);
		}

		path->lowest_level = level;
		path->reada = 2;
		ret = btrfs_search_slot(trans, found_root, &found_key, path,
					0, 1);
		path->lowest_level = 0;
		btrfs_release_path(found_root, path);

		if (found_root == found_root->fs_info->extent_root)
			btrfs_extent_post_op(trans, found_root);
		if (needs_lock)
			mutex_unlock(&extent_root->fs_info->alloc_mutex);

		btrfs_end_transaction(trans, found_root);

	}
out:
	/* restore the locking state the caller expects */
	mutex_lock(&extent_root->fs_info->alloc_mutex);
	return 0;
}
2944
2945 static int noinline del_extent_zero(struct btrfs_root *extent_root,
2946                                     struct btrfs_path *path,
2947                                     struct btrfs_key *extent_key)
2948 {
2949         int ret;
2950         struct btrfs_trans_handle *trans;
2951
2952         trans = btrfs_start_transaction(extent_root, 1);
2953         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
2954         if (ret > 0) {
2955                 ret = -EIO;
2956                 goto out;
2957         }
2958         if (ret < 0)
2959                 goto out;
2960         ret = btrfs_del_item(trans, extent_root, path);
2961 out:
2962         btrfs_end_transaction(trans, extent_root);
2963         return ret;
2964 }
2965
/*
 * relocate every holder of references on the extent described by
 * 'extent_key' by iterating its BTRFS_EXTENT_REF_KEY items and calling
 * relocate_one_reference() on each.  An extent record with objectid 0
 * is bogus and simply deleted.  The last_file_* locals let
 * relocate_one_reference() skip duplicate work within one extent.
 */
static int noinline relocate_one_extent(struct btrfs_root *extent_root,
					struct btrfs_path *path,
					struct btrfs_key *extent_key)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u64 last_file_objectid = 0;
	u64 last_file_root = 0;
	u64 last_file_offset = (u64)-1;
	u64 last_extent = 0;
	u32 nritems;
	u32 item_size;
	int ret = 0;

	if (extent_key->objectid == 0) {
		ret = del_extent_zero(extent_root, path, extent_key);
		goto out;
	}
	/* iterate all ref items that live under this extent's objectid */
	key.objectid = extent_key->objectid;
	key.type = BTRFS_EXTENT_REF_KEY;
	key.offset = 0;

	while(1) {
		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);

		if (ret < 0)
			goto out;

		ret = 0;
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (path->slots[0] == nritems) {
			ret = btrfs_next_leaf(extent_root, path);
			if (ret > 0) {
				ret = 0;
				goto out;
			}
			if (ret < 0)
				goto out;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		/* past the last ref of this extent, we're done */
		if (found_key.objectid != extent_key->objectid) {
			break;
		}

		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
			break;
		}

		/* advance past this ref for the next search */
		key.offset = found_key.offset + 1;
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);

		ret = relocate_one_reference(extent_root, path, extent_key,
					     &last_file_objectid,
					     &last_file_offset,
					     &last_file_root, last_extent);
		if (ret)
			goto out;
		last_extent = extent_key->objectid;
	}
	ret = 0;
out:
	btrfs_release_path(extent_root, path);
	return ret;
}
3034
3035 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
3036 {
3037         u64 num_devices;
3038         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
3039                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
3040
3041         num_devices = root->fs_info->fs_devices->num_devices;
3042         if (num_devices == 1) {
3043                 stripped |= BTRFS_BLOCK_GROUP_DUP;
3044                 stripped = flags & ~stripped;
3045
3046                 /* turn raid0 into single device chunks */
3047                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
3048                         return stripped;
3049
3050                 /* turn mirroring into duplication */
3051                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3052                              BTRFS_BLOCK_GROUP_RAID10))
3053                         return stripped | BTRFS_BLOCK_GROUP_DUP;
3054                 return flags;
3055         } else {
3056                 /* they already had raid on here, just return */
3057                 if (flags & stripped)
3058                         return flags;
3059
3060                 stripped |= BTRFS_BLOCK_GROUP_DUP;
3061                 stripped = flags & ~stripped;
3062
3063                 /* switch duplicated blocks with raid1 */
3064                 if (flags & BTRFS_BLOCK_GROUP_DUP)
3065                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
3066
3067                 /* turn single device chunks into raid0 */
3068                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
3069         }
3070         return flags;
3071 }
3072
/*
 * During block group shrinking, make sure there is somewhere for the
 * relocated data to go: if shrink_block_group still has bytes in use,
 * allocate a new chunk sized from either the used bytes (when the RAID
 * profile changes for the new device count) or the whole group size.
 * 'force' is passed through to do_chunk_alloc().
 *
 * The caller holds fs_info->alloc_mutex.  It is dropped around
 * btrfs_start_transaction()/btrfs_end_transaction() because those may
 * block on a transaction commit; holding alloc_mutex across them could
 * deadlock against the commit path.
 *
 * Always returns 0; the result of do_chunk_alloc() is intentionally
 * ignored (best effort).
 */
int __alloc_chunk_for_shrink(struct btrfs_root *root,
                     struct btrfs_block_group_cache *shrink_block_group,
                     int force)
{
        struct btrfs_trans_handle *trans;
        u64 new_alloc_flags;
        u64 calc;

        spin_lock(&shrink_block_group->lock);
        if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
                spin_unlock(&shrink_block_group->lock);
                /* drop alloc_mutex before starting a transaction */
                mutex_unlock(&root->fs_info->alloc_mutex);

                trans = btrfs_start_transaction(root, 1);
                mutex_lock(&root->fs_info->alloc_mutex);
                /*
                 * NOTE(review): the used-bytes check is not repeated after
                 * the locks are retaken -- confirm a concurrent free here
                 * is harmless (worst case: one extra chunk allocation).
                 */
                spin_lock(&shrink_block_group->lock);

                new_alloc_flags = update_block_group_flags(root,
                                                   shrink_block_group->flags);
                if (new_alloc_flags != shrink_block_group->flags) {
                        /* profile changes: only the used bytes must move */
                        calc =
                             btrfs_block_group_used(&shrink_block_group->item);
                } else {
                        calc = shrink_block_group->key.offset;
                }
                spin_unlock(&shrink_block_group->lock);

                /* 2MB of slack on top of the computed size */
                do_chunk_alloc(trans, root->fs_info->extent_root,
                               calc + 2 * 1024 * 1024, new_alloc_flags, force);

                /* same deadlock avoidance when ending the transaction */
                mutex_unlock(&root->fs_info->alloc_mutex);
                btrfs_end_transaction(trans, root);
                mutex_lock(&root->fs_info->alloc_mutex);
        } else
                spin_unlock(&shrink_block_group->lock);
        return 0;
}
3110
/*
 * Empty the block group whose byte range starts at shrink_start and then
 * delete its item from the extent tree.
 *
 * The group is marked read-only, every extent item inside its range is
 * relocated via relocate_one_extent(), and the loop restarts (after a
 * couple of transaction commits to flush pending work and drop old
 * snapshots) until a full pass finds nothing left.  Finally the in-memory
 * caches for the range are cleared and the block group item is removed.
 *
 * fs_info->alloc_mutex is held for the bulk of the work but dropped
 * around transaction start/commit to avoid deadlocking against the
 * commit path.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
        struct btrfs_path *path;
        u64 cur_byte;
        u64 total_found;
        u64 shrink_last_byte;
        struct btrfs_block_group_cache *shrink_block_group;
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int progress;

        mutex_lock(&root->fs_info->alloc_mutex);
        shrink_block_group = btrfs_lookup_block_group(root->fs_info,
                                                      shrink_start);
        BUG_ON(!shrink_block_group);

        /* first byte past the group's range */
        shrink_last_byte = shrink_block_group->key.objectid +
                shrink_block_group->key.offset;

        shrink_block_group->space_info->total_bytes -=
                shrink_block_group->key.offset;
        /* NOTE(review): btrfs_alloc_path() result is not checked here */
        path = btrfs_alloc_path();
        /* from here on, 'root' is the extent root, not the caller's root */
        root = root->fs_info->extent_root;
        path->reada = 2;

        printk("btrfs relocating block group %llu flags %llu\n",
               (unsigned long long)shrink_start,
               (unsigned long long)shrink_block_group->flags);

        /* make sure there is room to relocate into before we start */
        __alloc_chunk_for_shrink(root, shrink_block_group, 1);

again:

        /* keep new allocations out of the group while we empty it */
        shrink_block_group->ro = 1;

        total_found = 0;
        progress = 0;
        key.objectid = shrink_start;
        key.offset = 0;
        key.type = 0;
        cur_byte = key.objectid;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        /* an extent may start before shrink_start but overlap into it */
        ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
        if (ret < 0)
                goto out;

        if (ret == 0) {
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                if (found_key.objectid + found_key.offset > shrink_start &&
                    found_key.objectid < shrink_last_byte) {
                        /* back the scan up to cover the overlapping extent */
                        cur_byte = found_key.objectid;
                        key.objectid = cur_byte;
                }
        }
        btrfs_release_path(root, path);

        while(1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;

next:
                leaf = path->nodes[0];
                nritems = btrfs_header_nritems(leaf);
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        if (ret == 1) {
                                ret = 0;
                                break;
                        }
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                }

                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* scanned past the end of the group: this pass is done */
                if (found_key.objectid >= shrink_last_byte)
                        break;

                if (progress && need_resched()) {
                        /* drop the path so we can reschedule, then re-seek */
                        memcpy(&key, &found_key, sizeof(key));
                        cond_resched();
                        btrfs_release_path(root, path);
                        btrfs_search_slot(NULL, root, &key, path, 0, 0);
                        progress = 0;
                        goto next;
                }
                progress = 1;

                /* skip non-extent items and extents already relocated */
                if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
                    found_key.objectid + found_key.offset <= cur_byte) {
                        memcpy(&key, &found_key, sizeof(key));
                        key.offset++;
                        path->slots[0]++;
                        goto next;
                }

                total_found++;
                cur_byte = found_key.objectid + found_key.offset;
                key.objectid = cur_byte;
                btrfs_release_path(root, path);
                ret = relocate_one_extent(root, path, &found_key);
                /* relocation may fill the target; top up if needed */
                __alloc_chunk_for_shrink(root, shrink_block_group, 0);
        }

        btrfs_release_path(root, path);

        if (total_found > 0) {
                /* found_key is valid here: total_found > 0 implies the
                 * loop body set it at least once */
                printk("btrfs relocate found %llu last extent was %llu\n",
                       (unsigned long long)total_found,
                       (unsigned long long)found_key.objectid);
                mutex_unlock(&root->fs_info->alloc_mutex);
                /* commit twice so pending del-refs and snapshot cleanup
                 * fully drain before the next scan pass */
                trans = btrfs_start_transaction(tree_root, 1);
                btrfs_commit_transaction(trans, tree_root);

                btrfs_clean_old_snapshots(tree_root);

                trans = btrfs_start_transaction(tree_root, 1);
                btrfs_commit_transaction(trans, tree_root);
                mutex_lock(&root->fs_info->alloc_mutex);
                goto again;
        }

        /*
         * we've freed all the extents, now remove the block
         * group item from the tree
         */
        mutex_unlock(&root->fs_info->alloc_mutex);

        trans = btrfs_start_transaction(root, 1);

        mutex_lock(&root->fs_info->alloc_mutex);
        memcpy(&key, &shrink_block_group->key, sizeof(key));

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        /* the item must exist; a ret > 0 "not found" is corruption */
        if (ret > 0)
                ret = -EIO;
        /*
         * NOTE(review): on this error path the started transaction is
         * never ended/committed before goto out -- confirm this leak is
         * acceptable (the machine is likely unusable after -EIO here).
         */
        if (ret < 0)
                goto out;

        /* drop the in-memory caches covering the group's byte range */
        clear_extent_bits(&info->block_group_cache, key.objectid,
                          key.objectid + key.offset - 1,
                          (unsigned int)-1, GFP_NOFS);


        clear_extent_bits(&info->free_space_cache,
                           key.objectid, key.objectid + key.offset - 1,
                           (unsigned int)-1, GFP_NOFS);

        memset(shrink_block_group, 0, sizeof(*shrink_block_group));
        kfree(shrink_block_group);

        btrfs_del_item(trans, root, path);
        btrfs_release_path(root, path);
        mutex_unlock(&root->fs_info->alloc_mutex);
        btrfs_commit_transaction(trans, root);

        mutex_lock(&root->fs_info->alloc_mutex);

        /* the code to unpin extents might set a few bits in the free
         * space cache for this range again
         */
        clear_extent_bits(&info->free_space_cache,
                           key.objectid, key.objectid + key.offset - 1,
                           (unsigned int)-1, GFP_NOFS);
out:
        btrfs_free_path(path);
        mutex_unlock(&root->fs_info->alloc_mutex);
        return ret;
}
3294
3295 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
3296                            struct btrfs_key *key)
3297 {
3298         int ret = 0;
3299         struct btrfs_key found_key;
3300         struct extent_buffer *leaf;
3301         int slot;
3302
3303         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3304         if (ret < 0)
3305                 goto out;
3306
3307         while(1) {
3308                 slot = path->slots[0];
3309                 leaf = path->nodes[0];
3310                 if (slot >= btrfs_header_nritems(leaf)) {
3311                         ret = btrfs_next_leaf(root, path);
3312                         if (ret == 0)
3313                                 continue;
3314                         if (ret < 0)
3315                                 goto out;
3316                         break;
3317                 }
3318                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3319
3320                 if (found_key.objectid >= key->objectid &&
3321                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
3322                         ret = 0;
3323                         goto out;
3324                 }
3325                 path->slots[0]++;
3326         }
3327         ret = -ENOENT;
3328 out:
3329         return ret;
3330 }
3331
3332 int btrfs_read_block_groups(struct btrfs_root *root)
3333 {
3334         struct btrfs_path *path;
3335         int ret;
3336         int bit;
3337         struct btrfs_block_group_cache *cache;
3338         struct btrfs_fs_info *info = root->fs_info;
3339         struct btrfs_space_info *space_info;
3340         struct extent_io_tree *block_group_cache;
3341         struct btrfs_key key;
3342         struct btrfs_key found_key;
3343         struct extent_buffer *leaf;
3344
3345         block_group_cache = &info->block_group_cache;
3346         root = info->extent_root;
3347         key.objectid = 0;
3348         key.offset = 0;
3349         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3350         path = btrfs_alloc_path();
3351         if (!path)
3352                 return -ENOMEM;
3353
3354         mutex_lock(&root->fs_info->alloc_mutex);
3355         while(1) {
3356                 ret = find_first_block_group(root, path, &key);
3357                 if (ret > 0) {
3358                         ret = 0;
3359                         goto error;
3360                 }
3361                 if (ret != 0)
3362                         goto error;
3363
3364                 leaf = path->nodes[0];
3365                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3366                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
3367                 if (!cache) {
3368                         ret = -ENOMEM;
3369                         break;
3370                 }
3371
3372                 spin_lock_init(&cache->lock);
3373                 read_extent_buffer(leaf, &cache->item,
3374                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
3375                                    sizeof(cache->item));
3376                 memcpy(&cache->key, &found_key, sizeof(found_key));
3377
3378                 key.objectid = found_key.objectid + found_key.offset;
3379                 btrfs_release_path(root, path);
3380                 cache->flags = btrfs_block_group_flags(&cache->item);
3381                 bit = 0;
3382                 if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3383                         bit = BLOCK_GROUP_DATA;
3384                 } else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3385                         bit = BLOCK_GROUP_SYSTEM;
3386                 } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
3387                         bit = BLOCK_GROUP_METADATA;
3388                 }
3389                 set_avail_alloc_bits(info, cache->flags);
3390
3391                 ret = update_space_info(info, cache->flags, found_key.offset,
3392                                         btrfs_block_group_used(&cache->item),
3393                                         &space_info);
3394                 BUG_ON(ret);
3395                 cache->space_info = space_info;
3396
3397                 /* use EXTENT_LOCKED to prevent merging */
3398                 set_extent_bits(block_group_cache, found_key.objectid,
3399                                 found_key.objectid + found_key.offset - 1,
3400                                 EXTENT_LOCKED, GFP_NOFS);
3401                 set_state_private(block_group_cache, found_key.objectid,
3402                                   (unsigned long)cache);
3403                 set_extent_bits(block_group_cache, found_key.objectid,
3404                                 found_key.objectid + found_key.offset - 1,
3405                                 bit | EXTENT_LOCKED, GFP_NOFS);
3406                 if (key.objectid >=
3407                     btrfs_super_total_bytes(&info->super_copy))
3408                         break;
3409         }
3410         ret = 0;
3411 error:
3412         btrfs_free_path(path);
3413         mutex_unlock(&root->fs_info->alloc_mutex);
3414         return ret;
3415 }
3416
3417 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3418                            struct btrfs_root *root, u64 bytes_used,
3419                            u64 type, u64 chunk_objectid, u64 chunk_offset,
3420                            u64 size)
3421 {
3422         int ret;
3423         int bit = 0;
3424         struct btrfs_root *extent_root;
3425         struct btrfs_block_group_cache *cache;
3426         struct extent_io_tree *block_group_cache;
3427
3428         WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
3429         extent_root = root->fs_info->extent_root;
3430         block_group_cache = &root->fs_info->block_group_cache;
3431
3432         cache = kzalloc(sizeof(*cache), GFP_NOFS);
3433         BUG_ON(!cache);
3434         cache->key.objectid = chunk_offset;
3435         cache->key.offset = size;
3436         spin_lock_init(&cache->lock);
3437         btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3438
3439         btrfs_set_block_group_used(&cache->item, bytes_used);
3440         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
3441         cache->flags = type;
3442         btrfs_set_block_group_flags(&cache->item, type);
3443
3444         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
3445                                 &cache->space_info);
3446         BUG_ON(ret);
3447
3448         bit = block_group_state_bits(type);
3449         set_extent_bits(block_group_cache, chunk_offset,
3450                         chunk_offset + size - 1,
3451                         EXTENT_LOCKED, GFP_NOFS);
3452         set_state_private(block_group_cache, chunk_offset,
3453                           (unsigned long)cache);
3454         set_extent_bits(block_group_cache, chunk_offset,
3455                         chunk_offset + size - 1,
3456                         bit | EXTENT_LOCKED, GFP_NOFS);
3457
3458         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
3459                                 sizeof(cache->item));
3460         BUG_ON(ret);
3461
3462         finish_current_insert(trans, extent_root);
3463         ret = del_pending_extents(trans, extent_root);
3464         BUG_ON(ret);
3465         set_avail_alloc_bits(extent_root->fs_info, type);
3466
3467         return 0;
3468 }