f619c3cb13b7006cc7f95e273996917bbcebcf67
[pandora-kernel.git] fs/btrfs/extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include "compat.h"
27 #include "hash.h"
28 #include "ctree.h"
29 #include "disk-io.h"
30 #include "print-tree.h"
31 #include "transaction.h"
32 #include "volumes.h"
33 #include "locking.h"
34 #include "free-space-cache.h"
35
36 static int update_block_group(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               u64 bytenr, u64 num_bytes, int alloc);
39 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
40                                 struct btrfs_root *root,
41                                 u64 bytenr, u64 num_bytes, u64 parent,
42                                 u64 root_objectid, u64 owner_objectid,
43                                 u64 owner_offset, int refs_to_drop,
44                                 struct btrfs_delayed_extent_op *extra_op);
45 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
46                                     struct extent_buffer *leaf,
47                                     struct btrfs_extent_item *ei);
48 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
49                                       struct btrfs_root *root,
50                                       u64 parent, u64 root_objectid,
51                                       u64 flags, u64 owner, u64 offset,
52                                       struct btrfs_key *ins, int ref_mod);
53 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
54                                      struct btrfs_root *root,
55                                      u64 parent, u64 root_objectid,
56                                      u64 flags, struct btrfs_disk_key *key,
57                                      int level, struct btrfs_key *ins);
58 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
59                           struct btrfs_root *extent_root, u64 alloc_bytes,
60                           u64 flags, int force);
61 static int find_next_key(struct btrfs_path *path, int level,
62                          struct btrfs_key *key);
63 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
64                             int dump_block_groups);
65
66 static noinline int
67 block_group_cache_done(struct btrfs_block_group_cache *cache)
68 {
69         smp_mb();
70         return cache->cached == BTRFS_CACHE_FINISHED;
71 }
72
73 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
74 {
75         return (cache->flags & bits) == bits;
76 }
77
78 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
79 {
80         atomic_inc(&cache->count);
81 }
82
83 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
84 {
85         if (atomic_dec_and_test(&cache->count)) {
86                 WARN_ON(cache->pinned > 0);
87                 WARN_ON(cache->reserved > 0);
88                 WARN_ON(cache->reserved_pinned > 0);
89                 kfree(cache);
90         }
91 }
92
93 /*
94  * this adds the block group to the fs_info rb tree for the block group
95  * cache
96  */
97 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
98                                 struct btrfs_block_group_cache *block_group)
99 {
100         struct rb_node **p;
101         struct rb_node *parent = NULL;
102         struct btrfs_block_group_cache *cache;
103
104         spin_lock(&info->block_group_cache_lock);
105         p = &info->block_group_cache_tree.rb_node;
106
107         while (*p) {
108                 parent = *p;
109                 cache = rb_entry(parent, struct btrfs_block_group_cache,
110                                  cache_node);
111                 if (block_group->key.objectid < cache->key.objectid) {
112                         p = &(*p)->rb_left;
113                 } else if (block_group->key.objectid > cache->key.objectid) {
114                         p = &(*p)->rb_right;
115                 } else {
116                         spin_unlock(&info->block_group_cache_lock);
117                         return -EEXIST;
118                 }
119         }
120
121         rb_link_node(&block_group->cache_node, parent, p);
122         rb_insert_color(&block_group->cache_node,
123                         &info->block_group_cache_tree);
124         spin_unlock(&info->block_group_cache_lock);
125
126         return 0;
127 }
128
129 /*
130  * This will return the block group at or after bytenr if contains is 0, else
131  * it will return the block group that contains the bytenr
132  */
133 static struct btrfs_block_group_cache *
134 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
135                               int contains)
136 {
137         struct btrfs_block_group_cache *cache, *ret = NULL;
138         struct rb_node *n;
139         u64 end, start;
140
141         spin_lock(&info->block_group_cache_lock);
142         n = info->block_group_cache_tree.rb_node;
143
144         while (n) {
145                 cache = rb_entry(n, struct btrfs_block_group_cache,
146                                  cache_node);
147                 end = cache->key.objectid + cache->key.offset - 1;
148                 start = cache->key.objectid;
149
150                 if (bytenr < start) {
151                         if (!contains && (!ret || start < ret->key.objectid))
152                                 ret = cache;
153                         n = n->rb_left;
154                 } else if (bytenr > start) {
155                         if (contains && bytenr <= end) {
156                                 ret = cache;
157                                 break;
158                         }
159                         n = n->rb_right;
160                 } else {
161                         ret = cache;
162                         break;
163                 }
164         }
165         if (ret)
166                 btrfs_get_block_group(ret);
167         spin_unlock(&info->block_group_cache_lock);
168
169         return ret;
170 }
171
172 static int add_excluded_extent(struct btrfs_root *root,
173                                u64 start, u64 num_bytes)
174 {
175         u64 end = start + num_bytes - 1;
176         set_extent_bits(&root->fs_info->freed_extents[0],
177                         start, end, EXTENT_UPTODATE, GFP_NOFS);
178         set_extent_bits(&root->fs_info->freed_extents[1],
179                         start, end, EXTENT_UPTODATE, GFP_NOFS);
180         return 0;
181 }
182
183 static void free_excluded_extents(struct btrfs_root *root,
184                                   struct btrfs_block_group_cache *cache)
185 {
186         u64 start, end;
187
188         start = cache->key.objectid;
189         end = start + cache->key.offset - 1;
190
191         clear_extent_bits(&root->fs_info->freed_extents[0],
192                           start, end, EXTENT_UPTODATE, GFP_NOFS);
193         clear_extent_bits(&root->fs_info->freed_extents[1],
194                           start, end, EXTENT_UPTODATE, GFP_NOFS);
195 }
196
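/*
 * Superblock mirrors can live inside a block group.  Account their
 * stripes in bytes_super and mark the ranges in freed_extents so they
 * are never handed out as free space.
 */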
197 static int exclude_super_stripes(struct btrfs_root *root,
198                                  struct btrfs_block_group_cache *cache)
199 {
200         u64 bytenr;
201         u64 *logical;
202         int stripe_len;
203         int i, nr, ret;
204
205         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
206                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
207                 cache->bytes_super += stripe_len;
208                 ret = add_excluded_extent(root, cache->key.objectid,
209                                           stripe_len);
210                 BUG_ON(ret);
211         }
212
213         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
214                 bytenr = btrfs_sb_offset(i);
215                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
216                                        cache->key.objectid, bytenr,
217                                        0, &logical, &nr, &stripe_len);
218                 BUG_ON(ret);
219
220                 while (nr--) {
221                         cache->bytes_super += stripe_len;
222                         ret = add_excluded_extent(root, logical[nr],
223                                                   stripe_len);
224                         BUG_ON(ret);
225                 }
226
227                 kfree(logical);
228         }
229         return 0;
230 }
231
232 static struct btrfs_caching_control *
233 get_caching_control(struct btrfs_block_group_cache *cache)
234 {
235         struct btrfs_caching_control *ctl;
236
237         spin_lock(&cache->lock);
238         if (cache->cached != BTRFS_CACHE_STARTED) {
239                 spin_unlock(&cache->lock);
240                 return NULL;
241         }
242
243         /* We're loading it the fast way, so we don't have a caching_ctl. */
244         if (!cache->caching_ctl) {
245                 spin_unlock(&cache->lock);
246                 return NULL;
247         }
248
249         ctl = cache->caching_ctl;
250         atomic_inc(&ctl->count);
251         spin_unlock(&cache->lock);
252         return ctl;
253 }
254
255 static void put_caching_control(struct btrfs_caching_control *ctl)
256 {
257         if (atomic_dec_and_test(&ctl->count))
258                 kfree(ctl);
259 }
260
261 /*
262  * This is only called by cache_block_group.  Since we could have freed
263  * extents, we need to check pinned_extents for any extents that can't be
264  * used yet, as their free space is only released when the transaction commits.
265  */
266 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
267                               struct btrfs_fs_info *info, u64 start, u64 end)
268 {
269         u64 extent_start, extent_end, size, total_added = 0;
270         int ret;
271
272         while (start < end) {
273                 ret = find_first_extent_bit(info->pinned_extents, start,
274                                             &extent_start, &extent_end,
275                                             EXTENT_DIRTY | EXTENT_UPTODATE);
276                 if (ret)
277                         break;
278
279                 if (extent_start <= start) {
280                         start = extent_end + 1;
281                 } else if (extent_start > start && extent_start < end) {
282                         size = extent_start - start;
283                         total_added += size;
284                         ret = btrfs_add_free_space(block_group, start,
285                                                    size);
286                         BUG_ON(ret);
287                         start = extent_end + 1;
288                 } else {
289                         break;
290                 }
291         }
292
293         if (start < end) {
294                 size = end - start;
295                 total_added += size;
296                 ret = btrfs_add_free_space(block_group, start, size);
297                 BUG_ON(ret);
298         }
299
300         return total_added;
301 }
302
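/*
 * Background worker that scans the extent tree through the read-only
 * commit root and feeds every gap between allocated extents into the
 * block group's free space cache.
 */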
303 static int caching_kthread(void *data)
304 {
305         struct btrfs_block_group_cache *block_group = data;
306         struct btrfs_fs_info *fs_info = block_group->fs_info;
307         struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
308         struct btrfs_root *extent_root = fs_info->extent_root;
309         struct btrfs_path *path;
310         struct extent_buffer *leaf;
311         struct btrfs_key key;
312         u64 total_found = 0;
313         u64 last = 0;
314         u32 nritems;
315         int ret = 0;
316
317         path = btrfs_alloc_path();
318         if (!path)
319                 return -ENOMEM;
320
321         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
322
323         /*
324          * We don't want to deadlock with somebody trying to allocate a new
325          * extent for the extent root while also trying to search the extent
326          * root to add free space.  So we skip locking and search the commit
327          * root, since it's read-only.
328          */
329         path->skip_locking = 1;
330         path->search_commit_root = 1;
331         path->reada = 2;
332
333         key.objectid = last;
334         key.offset = 0;
335         key.type = BTRFS_EXTENT_ITEM_KEY;
336 again:
337         mutex_lock(&caching_ctl->mutex);
338         /* need to make sure the commit_root doesn't disappear */
339         down_read(&fs_info->extent_commit_sem);
340
341         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
342         if (ret < 0)
343                 goto err;
344
345         leaf = path->nodes[0];
346         nritems = btrfs_header_nritems(leaf);
347
348         while (1) {
349                 smp_mb();
350                 if (fs_info->closing > 1) {
351                         last = (u64)-1;
352                         break;
353                 }
354
355                 if (path->slots[0] < nritems) {
356                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
357                 } else {
358                         ret = find_next_key(path, 0, &key);
359                         if (ret)
360                                 break;
361
362                         caching_ctl->progress = last;
363                         btrfs_release_path(extent_root, path);
364                         up_read(&fs_info->extent_commit_sem);
365                         mutex_unlock(&caching_ctl->mutex);
366                         if (btrfs_transaction_in_commit(fs_info))
367                                 schedule_timeout(1);
368                         else
369                                 cond_resched();
370                         goto again;
371                 }
372
373                 if (key.objectid < block_group->key.objectid) {
374                         path->slots[0]++;
375                         continue;
376                 }
377
378                 if (key.objectid >= block_group->key.objectid +
379                     block_group->key.offset)
380                         break;
381
382                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
383                         total_found += add_new_free_space(block_group,
384                                                           fs_info, last,
385                                                           key.objectid);
386                         last = key.objectid + key.offset;
387
388                         if (total_found > (1024 * 1024 * 2)) {
389                                 total_found = 0;
390                                 wake_up(&caching_ctl->wait);
391                         }
392                 }
393                 path->slots[0]++;
394         }
395         ret = 0;
396
397         total_found += add_new_free_space(block_group, fs_info, last,
398                                           block_group->key.objectid +
399                                           block_group->key.offset);
400         caching_ctl->progress = (u64)-1;
401
402         spin_lock(&block_group->lock);
403         block_group->caching_ctl = NULL;
404         block_group->cached = BTRFS_CACHE_FINISHED;
405         spin_unlock(&block_group->lock);
406
407 err:
408         btrfs_free_path(path);
409         up_read(&fs_info->extent_commit_sem);
410
411         free_excluded_extents(extent_root, block_group);
412
413         mutex_unlock(&caching_ctl->mutex);
414         wake_up(&caching_ctl->wait);
415
416         put_caching_control(caching_ctl);
417         atomic_dec(&block_group->space_info->caching_threads);
418         btrfs_put_block_group(block_group);
419
420         return 0;
421 }
422
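/*
 * Start caching the free space in a block group.  When possible the
 * free space cache is loaded straight from disk; otherwise a caching
 * kthread is kicked off to build it by scanning the extent tree.
 */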
423 static int cache_block_group(struct btrfs_block_group_cache *cache,
424                              struct btrfs_trans_handle *trans,
425                              struct btrfs_root *root,
426                              int load_cache_only)
427 {
428         struct btrfs_fs_info *fs_info = cache->fs_info;
429         struct btrfs_caching_control *caching_ctl;
430         struct task_struct *tsk;
431         int ret = 0;
432
433         smp_mb();
434         if (cache->cached != BTRFS_CACHE_NO)
435                 return 0;
436
437         /*
438          * We can't do the read from the on-disk cache during a commit since we need
439          * to have the normal tree locking.  Also if we are currently trying to
440          * allocate blocks for the tree root we can't do the fast caching since
441          * we likely hold important locks.
442          */
443         if (trans && (!trans->transaction->in_commit) &&
444             (root && root != root->fs_info->tree_root)) {
445                 spin_lock(&cache->lock);
446                 if (cache->cached != BTRFS_CACHE_NO) {
447                         spin_unlock(&cache->lock);
448                         return 0;
449                 }
450                 cache->cached = BTRFS_CACHE_STARTED;
451                 spin_unlock(&cache->lock);
452
453                 ret = load_free_space_cache(fs_info, cache);
454
455                 spin_lock(&cache->lock);
456                 if (ret == 1) {
457                         cache->cached = BTRFS_CACHE_FINISHED;
458                         cache->last_byte_to_unpin = (u64)-1;
459                 } else {
460                         cache->cached = BTRFS_CACHE_NO;
461                 }
462                 spin_unlock(&cache->lock);
463                 if (ret == 1) {
464                         free_excluded_extents(fs_info->extent_root, cache);
465                         return 0;
466                 }
467         }
468
469         if (load_cache_only)
470                 return 0;
471
472         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
473         BUG_ON(!caching_ctl);
474
475         INIT_LIST_HEAD(&caching_ctl->list);
476         mutex_init(&caching_ctl->mutex);
477         init_waitqueue_head(&caching_ctl->wait);
478         caching_ctl->block_group = cache;
479         caching_ctl->progress = cache->key.objectid;
480         /* one for caching kthread, one for caching block group list */
481         atomic_set(&caching_ctl->count, 2);
482
483         spin_lock(&cache->lock);
484         if (cache->cached != BTRFS_CACHE_NO) {
485                 spin_unlock(&cache->lock);
486                 kfree(caching_ctl);
487                 return 0;
488         }
489         cache->caching_ctl = caching_ctl;
490         cache->cached = BTRFS_CACHE_STARTED;
491         spin_unlock(&cache->lock);
492
493         down_write(&fs_info->extent_commit_sem);
494         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
495         up_write(&fs_info->extent_commit_sem);
496
497         atomic_inc(&cache->space_info->caching_threads);
498         btrfs_get_block_group(cache);
499
500         tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
501                           cache->key.objectid);
502         if (IS_ERR(tsk)) {
503                 ret = PTR_ERR(tsk);
504                 printk(KERN_ERR "error running thread %d\n", ret);
505                 BUG();
506         }
507
508         return ret;
509 }
510
511 /*
512  * return the block group that starts at or after bytenr
513  */
514 static struct btrfs_block_group_cache *
515 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
516 {
517         struct btrfs_block_group_cache *cache;
518
519         cache = block_group_cache_tree_search(info, bytenr, 0);
520
521         return cache;
522 }
523
524 /*
525  * return the block group that contains the given bytenr
526  */
527 struct btrfs_block_group_cache *btrfs_lookup_block_group(
528                                                  struct btrfs_fs_info *info,
529                                                  u64 bytenr)
530 {
531         struct btrfs_block_group_cache *cache;
532
533         cache = block_group_cache_tree_search(info, bytenr, 1);
534
535         return cache;
536 }
537
538 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
539                                                   u64 flags)
540 {
541         struct list_head *head = &info->space_info;
542         struct btrfs_space_info *found;
543
544         flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
545                  BTRFS_BLOCK_GROUP_METADATA;
546
547         rcu_read_lock();
548         list_for_each_entry_rcu(found, head, list) {
549                 if (found->flags & flags) {
550                         rcu_read_unlock();
551                         return found;
552                 }
553         }
554         rcu_read_unlock();
555         return NULL;
556 }
557
558 /*
559  * after adding space to the filesystem, we need to clear the full flags
560  * on all the space infos.
561  */
562 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
563 {
564         struct list_head *head = &info->space_info;
565         struct btrfs_space_info *found;
566
567         rcu_read_lock();
568         list_for_each_entry_rcu(found, head, list)
569                 found->full = 0;
570         rcu_read_unlock();
571 }
572
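/* scale num by factor/10; div_factor_fine below scales by factor/100 */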
573 static u64 div_factor(u64 num, int factor)
574 {
575         if (factor == 10)
576                 return num;
577         num *= factor;
578         do_div(num, 10);
579         return num;
580 }
581
582 static u64 div_factor_fine(u64 num, int factor)
583 {
584         if (factor == 100)
585                 return num;
586         num *= factor;
587         do_div(num, 100);
588         return num;
589 }
590
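/*
 * Scan the metadata block groups for one that is less than factor/10
 * full, starting at search_hint and wrapping around to search_start.
 * Returns the start of the group found, or 0 if none qualifies.
 */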
591 u64 btrfs_find_block_group(struct btrfs_root *root,
592                            u64 search_start, u64 search_hint, int owner)
593 {
594         struct btrfs_block_group_cache *cache;
595         u64 used;
596         u64 last = max(search_hint, search_start);
597         u64 group_start = 0;
598         int full_search = 0;
599         int factor = 9;
600         int wrapped = 0;
601 again:
602         while (1) {
603                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
604                 if (!cache)
605                         break;
606
607                 spin_lock(&cache->lock);
608                 last = cache->key.objectid + cache->key.offset;
609                 used = btrfs_block_group_used(&cache->item);
610
611                 if ((full_search || !cache->ro) &&
612                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
613                         if (used + cache->pinned + cache->reserved <
614                             div_factor(cache->key.offset, factor)) {
615                                 group_start = cache->key.objectid;
616                                 spin_unlock(&cache->lock);
617                                 btrfs_put_block_group(cache);
618                                 goto found;
619                         }
620                 }
621                 spin_unlock(&cache->lock);
622                 btrfs_put_block_group(cache);
623                 cond_resched();
624         }
625         if (!wrapped) {
626                 last = search_start;
627                 wrapped = 1;
628                 goto again;
629         }
630         if (!full_search && factor < 10) {
631                 last = search_start;
632                 full_search = 1;
633                 factor = 10;
634                 goto again;
635         }
636 found:
637         return group_start;
638 }
639
640 /* simple helper to search for an existing extent at a given offset */
641 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
642 {
643         int ret;
644         struct btrfs_key key;
645         struct btrfs_path *path;
646
647         path = btrfs_alloc_path();
648         BUG_ON(!path);
649         key.objectid = start;
650         key.offset = len;
651         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
652         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
653                                 0, 0);
654         btrfs_free_path(path);
655         return ret;
656 }
657
658 /*
659  * helper function to look up the reference count and flags of an extent.
660  *
661  * the head node for a delayed ref is used to store the sum of all the
662  * reference count modifications queued up in the rbtree. the head
663  * node may also store the extent flags to set. This way you can check
664  * to see what the reference count and extent flags would be if all of
665  * the delayed refs were run.
666  */
667 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
668                              struct btrfs_root *root, u64 bytenr,
669                              u64 num_bytes, u64 *refs, u64 *flags)
670 {
671         struct btrfs_delayed_ref_head *head;
672         struct btrfs_delayed_ref_root *delayed_refs;
673         struct btrfs_path *path;
674         struct btrfs_extent_item *ei;
675         struct extent_buffer *leaf;
676         struct btrfs_key key;
677         u32 item_size;
678         u64 num_refs;
679         u64 extent_flags;
680         int ret;
681
682         path = btrfs_alloc_path();
683         if (!path)
684                 return -ENOMEM;
685
686         key.objectid = bytenr;
687         key.type = BTRFS_EXTENT_ITEM_KEY;
688         key.offset = num_bytes;
689         if (!trans) {
690                 path->skip_locking = 1;
691                 path->search_commit_root = 1;
692         }
693 again:
694         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
695                                 &key, path, 0, 0);
696         if (ret < 0)
697                 goto out_free;
698
699         if (ret == 0) {
700                 leaf = path->nodes[0];
701                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
702                 if (item_size >= sizeof(*ei)) {
703                         ei = btrfs_item_ptr(leaf, path->slots[0],
704                                             struct btrfs_extent_item);
705                         num_refs = btrfs_extent_refs(leaf, ei);
706                         extent_flags = btrfs_extent_flags(leaf, ei);
707                 } else {
708 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
709                         struct btrfs_extent_item_v0 *ei0;
710                         BUG_ON(item_size != sizeof(*ei0));
711                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
712                                              struct btrfs_extent_item_v0);
713                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
714                         /* FIXME: this isn't correct for data */
715                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
716 #else
717                         BUG();
718 #endif
719                 }
720                 BUG_ON(num_refs == 0);
721         } else {
722                 num_refs = 0;
723                 extent_flags = 0;
724                 ret = 0;
725         }
726
727         if (!trans)
728                 goto out;
729
730         delayed_refs = &trans->transaction->delayed_refs;
731         spin_lock(&delayed_refs->lock);
732         head = btrfs_find_delayed_ref_head(trans, bytenr);
733         if (head) {
734                 if (!mutex_trylock(&head->mutex)) {
735                         atomic_inc(&head->node.refs);
736                         spin_unlock(&delayed_refs->lock);
737
738                         btrfs_release_path(root->fs_info->extent_root, path);
739
740                         mutex_lock(&head->mutex);
741                         mutex_unlock(&head->mutex);
742                         btrfs_put_delayed_ref(&head->node);
743                         goto again;
744                 }
745                 if (head->extent_op && head->extent_op->update_flags)
746                         extent_flags |= head->extent_op->flags_to_set;
747                 else
748                         BUG_ON(num_refs == 0);
749
750                 num_refs += head->node.ref_mod;
751                 mutex_unlock(&head->mutex);
752         }
753         spin_unlock(&delayed_refs->lock);
754 out:
755         WARN_ON(num_refs == 0);
756         if (refs)
757                 *refs = num_refs;
758         if (flags)
759                 *flags = extent_flags;
760 out_free:
761         btrfs_free_path(path);
762         return ret;
763 }
764
765 /*
766  * Back reference rules.  Back refs have three main goals:
767  *
768  * 1) differentiate between all holders of references to an extent so that
769  *    when a reference is dropped we can make sure it was a valid reference
770  *    before freeing the extent.
771  *
772  * 2) Provide enough information to quickly find the holders of an extent
773  *    if we notice a given block is corrupted or bad.
774  *
775  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
776  *    maintenance.  This is actually the same as #2, but with a slightly
777  *    different use case.
778  *
779  * There are two kinds of back refs. Implicit back refs are optimized
780  * for pointers in non-shared tree blocks. For a given pointer in a block,
781  * back refs of this kind provide information about the block's owner tree
782  * and the pointer's key. This information allows us to find the block by
783  * b-tree searching. Full back refs are for pointers in tree blocks not
784  * referenced by their owner trees. The location of the tree block is
785  * recorded in the back ref. Full back refs are actually generic and can
786  * be used in all cases where implicit back refs are used. Their major
787  * shortcoming is overhead: every time a tree block gets COWed, we have
788  * to update the back ref entries for all pointers in it.
789  *
790  * For a newly allocated tree block, we use implicit back refs for the
791  * pointers in it. This means most tree related operations only involve
792  * implicit back refs. For a tree block created in an old transaction,
793  * the only way to drop a reference to it is to COW it. So we can detect
794  * the event that a tree block loses its owner tree's reference and do
795  * the back ref conversion.
796  *
797  * When a tree block is COWed through a tree, there are four cases:
798  *
799  * The reference count of the block is one and the tree is the block's
800  * owner tree. Nothing to do in this case.
801  *
802  * The reference count of the block is one and the tree is not the
803  * block's owner tree. In this case, full back refs are used for the
804  * pointers in the block. Remove these full back refs and add implicit
805  * back refs for every pointer in the new block.
806  *
807  * The reference count of the block is greater than one and the tree is
808  * the block's owner tree. In this case, implicit back refs are used for
809  * the pointers in the block. Add full back refs for every pointer in the
810  * block and increase the lower level extents' reference counts. The
811  * original implicit back refs are inherited by the new block.
812  *
813  * The reference count of the block is greater than one and the tree is
814  * not the block's owner tree. Add implicit back refs for every pointer in
815  * the new block and increase the lower level extents' reference counts.
816  *
817  * Back reference key composition:
818  *
819  * The key objectid corresponds to the first byte in the extent, and
820  * the key type is used to differentiate between types of back refs.
821  * The meaning of the key offset differs between the types of
822  * back refs.
823  *
824  * File extents can be referenced by:
825  *
826  * - multiple snapshots, subvolumes, or different generations in one subvol
827  * - different files inside a single subvolume
828  * - different offsets inside a file (bookend extents in file.c)
829  *
830  * The extent ref structure for the implicit back refs has fields for:
831  *
832  * - objectid of the subvolume root
833  * - objectid of the file holding the reference
834  * - original offset in the file
835  * - how many bookend extents
836  *
837  * The key offset for the implicit back refs is the hash of the first
838  * three fields.
839  *
840  * The extent ref structure for the full back refs has a field for:
841  *
842  * - number of pointers in the tree leaf
843  *
844  * The key offset for the full back refs is the first byte of
845  * the tree leaf.
846  *
847  * When a file extent is allocated, implicit back refs are used and
848  * the fields are filled in:
849  *
850  *     (root_key.objectid, inode objectid, offset in file, 1)
851  *
852  * When a file extent is removed during file truncation, we find the
853  * corresponding implicit back refs and check the following fields:
854  *
855  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
856  *
857  * Btree extents can be referenced by:
858  *
859  * - different subvolumes
860  *
861  * Both the implicit back refs and the full back refs for tree blocks
862  * only consist of a key. The key offset for the implicit back refs is the
863  * objectid of the block's owner tree. The key offset for the full back
864  * refs is the first byte of the parent block.
865  *
866  * When implicit back refs are used, information about the lowest key and
867  * the level of the tree block is required. This information is stored in
868  * the tree block info structure.
869  */
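/*
 * Illustrative key layouts implied by the rules above (informal sketch):
 *
 *   implicit data ref: (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *                       hash(root objectid, inode objectid, file offset))
 *   shared data ref:   (extent bytenr, BTRFS_SHARED_DATA_REF_KEY,
 *                       bytenr of the leaf that holds the file extent item)
 *   implicit tree ref: (block bytenr, BTRFS_TREE_BLOCK_REF_KEY,
 *                       objectid of the owner tree)
 *   full tree ref:     (block bytenr, BTRFS_SHARED_BLOCK_REF_KEY,
 *                       bytenr of the parent block)
 */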
870
871 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
872 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
873                                   struct btrfs_root *root,
874                                   struct btrfs_path *path,
875                                   u64 owner, u32 extra_size)
876 {
877         struct btrfs_extent_item *item;
878         struct btrfs_extent_item_v0 *ei0;
879         struct btrfs_extent_ref_v0 *ref0;
880         struct btrfs_tree_block_info *bi;
881         struct extent_buffer *leaf;
882         struct btrfs_key key;
883         struct btrfs_key found_key;
884         u32 new_size = sizeof(*item);
885         u64 refs;
886         int ret;
887
888         leaf = path->nodes[0];
889         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
890
891         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
892         ei0 = btrfs_item_ptr(leaf, path->slots[0],
893                              struct btrfs_extent_item_v0);
894         refs = btrfs_extent_refs_v0(leaf, ei0);
895
896         if (owner == (u64)-1) {
897                 while (1) {
898                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
899                                 ret = btrfs_next_leaf(root, path);
900                                 if (ret < 0)
901                                         return ret;
902                                 BUG_ON(ret > 0);
903                                 leaf = path->nodes[0];
904                         }
905                         btrfs_item_key_to_cpu(leaf, &found_key,
906                                               path->slots[0]);
907                         BUG_ON(key.objectid != found_key.objectid);
908                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
909                                 path->slots[0]++;
910                                 continue;
911                         }
912                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
913                                               struct btrfs_extent_ref_v0);
914                         owner = btrfs_ref_objectid_v0(leaf, ref0);
915                         break;
916                 }
917         }
918         btrfs_release_path(root, path);
919
920         if (owner < BTRFS_FIRST_FREE_OBJECTID)
921                 new_size += sizeof(*bi);
922
923         new_size -= sizeof(*ei0);
924         ret = btrfs_search_slot(trans, root, &key, path,
925                                 new_size + extra_size, 1);
926         if (ret < 0)
927                 return ret;
928         BUG_ON(ret);
929
930         ret = btrfs_extend_item(trans, root, path, new_size);
931         BUG_ON(ret);
932
933         leaf = path->nodes[0];
934         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
935         btrfs_set_extent_refs(leaf, item, refs);
936         /* FIXME: get real generation */
937         btrfs_set_extent_generation(leaf, item, 0);
938         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
939                 btrfs_set_extent_flags(leaf, item,
940                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
941                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
942                 bi = (struct btrfs_tree_block_info *)(item + 1);
943                 /* FIXME: get first key of the block */
944                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
945                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
946         } else {
947                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
948         }
949         btrfs_mark_buffer_dirty(leaf);
950         return 0;
951 }
952 #endif
953
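/*
 * Hash used as the key offset of implicit data back refs: crc32c of the
 * owning root objectid, and of the inode objectid plus the file offset,
 * folded together into a single u64.
 */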
954 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
955 {
956         u32 high_crc = ~(u32)0;
957         u32 low_crc = ~(u32)0;
958         __le64 lenum;
959
960         lenum = cpu_to_le64(root_objectid);
961         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
962         lenum = cpu_to_le64(owner);
963         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
964         lenum = cpu_to_le64(offset);
965         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
966
967         return ((u64)high_crc << 31) ^ (u64)low_crc;
968 }
969
970 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
971                                      struct btrfs_extent_data_ref *ref)
972 {
973         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
974                                     btrfs_extent_data_ref_objectid(leaf, ref),
975                                     btrfs_extent_data_ref_offset(leaf, ref));
976 }
977
978 static int match_extent_data_ref(struct extent_buffer *leaf,
979                                  struct btrfs_extent_data_ref *ref,
980                                  u64 root_objectid, u64 owner, u64 offset)
981 {
982         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
983             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
984             btrfs_extent_data_ref_offset(leaf, ref) != offset)
985                 return 0;
986         return 1;
987 }
988
989 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
990                                            struct btrfs_root *root,
991                                            struct btrfs_path *path,
992                                            u64 bytenr, u64 parent,
993                                            u64 root_objectid,
994                                            u64 owner, u64 offset)
995 {
996         struct btrfs_key key;
997         struct btrfs_extent_data_ref *ref;
998         struct extent_buffer *leaf;
999         u32 nritems;
1000         int ret;
1001         int recow;
1002         int err = -ENOENT;
1003
1004         key.objectid = bytenr;
1005         if (parent) {
1006                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1007                 key.offset = parent;
1008         } else {
1009                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1010                 key.offset = hash_extent_data_ref(root_objectid,
1011                                                   owner, offset);
1012         }
1013 again:
1014         recow = 0;
1015         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1016         if (ret < 0) {
1017                 err = ret;
1018                 goto fail;
1019         }
1020
1021         if (parent) {
1022                 if (!ret)
1023                         return 0;
1024 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1025                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1026                 btrfs_release_path(root, path);
1027                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1028                 if (ret < 0) {
1029                         err = ret;
1030                         goto fail;
1031                 }
1032                 if (!ret)
1033                         return 0;
1034 #endif
1035                 goto fail;
1036         }
1037
1038         leaf = path->nodes[0];
1039         nritems = btrfs_header_nritems(leaf);
1040         while (1) {
1041                 if (path->slots[0] >= nritems) {
1042                         ret = btrfs_next_leaf(root, path);
1043                         if (ret < 0)
1044                                 err = ret;
1045                         if (ret)
1046                                 goto fail;
1047
1048                         leaf = path->nodes[0];
1049                         nritems = btrfs_header_nritems(leaf);
1050                         recow = 1;
1051                 }
1052
1053                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1054                 if (key.objectid != bytenr ||
1055                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1056                         goto fail;
1057
1058                 ref = btrfs_item_ptr(leaf, path->slots[0],
1059                                      struct btrfs_extent_data_ref);
1060
1061                 if (match_extent_data_ref(leaf, ref, root_objectid,
1062                                           owner, offset)) {
1063                         if (recow) {
1064                                 btrfs_release_path(root, path);
1065                                 goto again;
1066                         }
1067                         err = 0;
1068                         break;
1069                 }
1070                 path->slots[0]++;
1071         }
1072 fail:
1073         return err;
1074 }
1075
1076 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1077                                            struct btrfs_root *root,
1078                                            struct btrfs_path *path,
1079                                            u64 bytenr, u64 parent,
1080                                            u64 root_objectid, u64 owner,
1081                                            u64 offset, int refs_to_add)
1082 {
1083         struct btrfs_key key;
1084         struct extent_buffer *leaf;
1085         u32 size;
1086         u32 num_refs;
1087         int ret;
1088
1089         key.objectid = bytenr;
1090         if (parent) {
1091                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1092                 key.offset = parent;
1093                 size = sizeof(struct btrfs_shared_data_ref);
1094         } else {
1095                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1096                 key.offset = hash_extent_data_ref(root_objectid,
1097                                                   owner, offset);
1098                 size = sizeof(struct btrfs_extent_data_ref);
1099         }
1100
1101         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1102         if (ret && ret != -EEXIST)
1103                 goto fail;
1104
1105         leaf = path->nodes[0];
1106         if (parent) {
1107                 struct btrfs_shared_data_ref *ref;
1108                 ref = btrfs_item_ptr(leaf, path->slots[0],
1109                                      struct btrfs_shared_data_ref);
1110                 if (ret == 0) {
1111                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1112                 } else {
1113                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1114                         num_refs += refs_to_add;
1115                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1116                 }
1117         } else {
1118                 struct btrfs_extent_data_ref *ref;
1119                 while (ret == -EEXIST) {
1120                         ref = btrfs_item_ptr(leaf, path->slots[0],
1121                                              struct btrfs_extent_data_ref);
1122                         if (match_extent_data_ref(leaf, ref, root_objectid,
1123                                                   owner, offset))
1124                                 break;
1125                         btrfs_release_path(root, path);
1126                         key.offset++;
1127                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1128                                                       size);
1129                         if (ret && ret != -EEXIST)
1130                                 goto fail;
1131
1132                         leaf = path->nodes[0];
1133                 }
1134                 ref = btrfs_item_ptr(leaf, path->slots[0],
1135                                      struct btrfs_extent_data_ref);
1136                 if (ret == 0) {
1137                         btrfs_set_extent_data_ref_root(leaf, ref,
1138                                                        root_objectid);
1139                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1140                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1141                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1142                 } else {
1143                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1144                         num_refs += refs_to_add;
1145                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1146                 }
1147         }
1148         btrfs_mark_buffer_dirty(leaf);
1149         ret = 0;
1150 fail:
1151         btrfs_release_path(root, path);
1152         return ret;
1153 }
1154
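/*
 * Drop refs_to_drop references from the data back ref item that path
 * points at, deleting the item once its count reaches zero.
 */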
1155 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1156                                            struct btrfs_root *root,
1157                                            struct btrfs_path *path,
1158                                            int refs_to_drop)
1159 {
1160         struct btrfs_key key;
1161         struct btrfs_extent_data_ref *ref1 = NULL;
1162         struct btrfs_shared_data_ref *ref2 = NULL;
1163         struct extent_buffer *leaf;
1164         u32 num_refs = 0;
1165         int ret = 0;
1166
1167         leaf = path->nodes[0];
1168         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1169
1170         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1171                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1172                                       struct btrfs_extent_data_ref);
1173                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1174         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1175                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1176                                       struct btrfs_shared_data_ref);
1177                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1178 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1179         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1180                 struct btrfs_extent_ref_v0 *ref0;
1181                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1182                                       struct btrfs_extent_ref_v0);
1183                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1184 #endif
1185         } else {
1186                 BUG();
1187         }
1188
1189         BUG_ON(num_refs < refs_to_drop);
1190         num_refs -= refs_to_drop;
1191
1192         if (num_refs == 0) {
1193                 ret = btrfs_del_item(trans, root, path);
1194         } else {
1195                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1196                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1197                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1198                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1199 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1200                 else {
1201                         struct btrfs_extent_ref_v0 *ref0;
1202                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1203                                         struct btrfs_extent_ref_v0);
1204                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1205                 }
1206 #endif
1207                 btrfs_mark_buffer_dirty(leaf);
1208         }
1209         return ret;
1210 }
1211
1212 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1213                                           struct btrfs_path *path,
1214                                           struct btrfs_extent_inline_ref *iref)
1215 {
1216         struct btrfs_key key;
1217         struct extent_buffer *leaf;
1218         struct btrfs_extent_data_ref *ref1;
1219         struct btrfs_shared_data_ref *ref2;
1220         u32 num_refs = 0;
1221
1222         leaf = path->nodes[0];
1223         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1224         if (iref) {
1225                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1226                     BTRFS_EXTENT_DATA_REF_KEY) {
1227                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1228                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1229                 } else {
1230                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1231                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1232                 }
1233         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1234                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1235                                       struct btrfs_extent_data_ref);
1236                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1237         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1238                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1239                                       struct btrfs_shared_data_ref);
1240                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1241 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1242         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1243                 struct btrfs_extent_ref_v0 *ref0;
1244                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1245                                       struct btrfs_extent_ref_v0);
1246                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1247 #endif
1248         } else {
1249                 WARN_ON(1);
1250         }
1251         return num_refs;
1252 }
1253
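/*
 * Tree block back refs carry no payload; the key alone records the
 * reference: SHARED_BLOCK_REF keyed on the parent block bytenr, or
 * TREE_BLOCK_REF keyed on the owner root objectid.
 */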
1254 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1255                                           struct btrfs_root *root,
1256                                           struct btrfs_path *path,
1257                                           u64 bytenr, u64 parent,
1258                                           u64 root_objectid)
1259 {
1260         struct btrfs_key key;
1261         int ret;
1262
1263         key.objectid = bytenr;
1264         if (parent) {
1265                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1266                 key.offset = parent;
1267         } else {
1268                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1269                 key.offset = root_objectid;
1270         }
1271
1272         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1273         if (ret > 0)
1274                 ret = -ENOENT;
1275 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1276         if (ret == -ENOENT && parent) {
1277                 btrfs_release_path(root, path);
1278                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1279                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1280                 if (ret > 0)
1281                         ret = -ENOENT;
1282         }
1283 #endif
1284         return ret;
1285 }
1286
1287 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1288                                           struct btrfs_root *root,
1289                                           struct btrfs_path *path,
1290                                           u64 bytenr, u64 parent,
1291                                           u64 root_objectid)
1292 {
1293         struct btrfs_key key;
1294         int ret;
1295
1296         key.objectid = bytenr;
1297         if (parent) {
1298                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1299                 key.offset = parent;
1300         } else {
1301                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1302                 key.offset = root_objectid;
1303         }
1304
1305         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1306         btrfs_release_path(root, path);
1307         return ret;
1308 }
1309
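/*
 * Pick the back ref key type: tree block (owner below
 * BTRFS_FIRST_FREE_OBJECTID) vs data, and shared (parent != 0) vs
 * implicit.
 */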
1310 static inline int extent_ref_type(u64 parent, u64 owner)
1311 {
1312         int type;
1313         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1314                 if (parent > 0)
1315                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1316                 else
1317                         type = BTRFS_TREE_BLOCK_REF_KEY;
1318         } else {
1319                 if (parent > 0)
1320                         type = BTRFS_SHARED_DATA_REF_KEY;
1321                 else
1322                         type = BTRFS_EXTENT_DATA_REF_KEY;
1323         }
1324         return type;
1325 }
1326
1327 static int find_next_key(struct btrfs_path *path, int level,
1328                          struct btrfs_key *key)
1329
1330 {
1331         for (; level < BTRFS_MAX_LEVEL; level++) {
1332                 if (!path->nodes[level])
1333                         break;
1334                 if (path->slots[level] + 1 >=
1335                     btrfs_header_nritems(path->nodes[level]))
1336                         continue;
1337                 if (level == 0)
1338                         btrfs_item_key_to_cpu(path->nodes[level], key,
1339                                               path->slots[level] + 1);
1340                 else
1341                         btrfs_node_key_to_cpu(path->nodes[level], key,
1342                                               path->slots[level] + 1);
1343                 return 0;
1344         }
1345         return 1;
1346 }
1347
1348 /*
1349  * look for an inline back ref. if the back ref is found, *ref_ret is set
1350  * to the address of the inline back ref, and 0 is returned.
1351  *
1352  * if the back ref isn't found, *ref_ret is set to the address where it
1353  * should be inserted, and -ENOENT is returned.
1354  *
1355  * if insert is true and there are too many inline back refs, the path
1356  * points to the extent item, and -EAGAIN is returned.
1357  *
1358  * NOTE: inline back refs are ordered in the same way that back ref
1359  *       items in the tree are ordered.
1360  */
1361 static noinline_for_stack
1362 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1363                                  struct btrfs_root *root,
1364                                  struct btrfs_path *path,
1365                                  struct btrfs_extent_inline_ref **ref_ret,
1366                                  u64 bytenr, u64 num_bytes,
1367                                  u64 parent, u64 root_objectid,
1368                                  u64 owner, u64 offset, int insert)
1369 {
1370         struct btrfs_key key;
1371         struct extent_buffer *leaf;
1372         struct btrfs_extent_item *ei;
1373         struct btrfs_extent_inline_ref *iref;
1374         u64 flags;
1375         u64 item_size;
1376         unsigned long ptr;
1377         unsigned long end;
1378         int extra_size;
1379         int type;
1380         int want;
1381         int ret;
1382         int err = 0;
1383
1384         key.objectid = bytenr;
1385         key.type = BTRFS_EXTENT_ITEM_KEY;
1386         key.offset = num_bytes;
1387
1388         want = extent_ref_type(parent, owner);
1389         if (insert) {
1390                 extra_size = btrfs_extent_inline_ref_size(want);
1391                 path->keep_locks = 1;
1392         } else
1393                 extra_size = -1;
1394         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1395         if (ret < 0) {
1396                 err = ret;
1397                 goto out;
1398         }
1399         BUG_ON(ret);
1400
1401         leaf = path->nodes[0];
1402         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1403 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1404         if (item_size < sizeof(*ei)) {
1405                 if (!insert) {
1406                         err = -ENOENT;
1407                         goto out;
1408                 }
1409                 ret = convert_extent_item_v0(trans, root, path, owner,
1410                                              extra_size);
1411                 if (ret < 0) {
1412                         err = ret;
1413                         goto out;
1414                 }
1415                 leaf = path->nodes[0];
1416                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1417         }
1418 #endif
1419         BUG_ON(item_size < sizeof(*ei));
1420
1421         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1422         flags = btrfs_extent_flags(leaf, ei);
1423
1424         ptr = (unsigned long)(ei + 1);
1425         end = (unsigned long)ei + item_size;
1426
1427         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1428                 ptr += sizeof(struct btrfs_tree_block_info);
1429                 BUG_ON(ptr > end);
1430         } else {
1431                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1432         }
1433
1434         err = -ENOENT;
1435         while (1) {
1436                 if (ptr >= end) {
1437                         WARN_ON(ptr > end);
1438                         break;
1439                 }
1440                 iref = (struct btrfs_extent_inline_ref *)ptr;
1441                 type = btrfs_extent_inline_ref_type(leaf, iref);
1442                 if (want < type)
1443                         break;
1444                 if (want > type) {
1445                         ptr += btrfs_extent_inline_ref_size(type);
1446                         continue;
1447                 }
1448
1449                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1450                         struct btrfs_extent_data_ref *dref;
1451                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1452                         if (match_extent_data_ref(leaf, dref, root_objectid,
1453                                                   owner, offset)) {
1454                                 err = 0;
1455                                 break;
1456                         }
1457                         if (hash_extent_data_ref_item(leaf, dref) <
1458                             hash_extent_data_ref(root_objectid, owner, offset))
1459                                 break;
1460                 } else {
1461                         u64 ref_offset;
1462                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1463                         if (parent > 0) {
1464                                 if (parent == ref_offset) {
1465                                         err = 0;
1466                                         break;
1467                                 }
1468                                 if (ref_offset < parent)
1469                                         break;
1470                         } else {
1471                                 if (root_objectid == ref_offset) {
1472                                         err = 0;
1473                                         break;
1474                                 }
1475                                 if (ref_offset < root_objectid)
1476                                         break;
1477                         }
1478                 }
1479                 ptr += btrfs_extent_inline_ref_size(type);
1480         }
1481         if (err == -ENOENT && insert) {
1482                 if (item_size + extra_size >=
1483                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1484                         err = -EAGAIN;
1485                         goto out;
1486                 }
1487                 /*
1488                  * To add a new inline back ref, we have to make sure
1489                  * there is no corresponding back ref item.
1490                  * For simplicity, we just do not add a new inline back
1491                  * ref if there is any kind of item for this block.
1492                  */
1493                 if (find_next_key(path, 0, &key) == 0 &&
1494                     key.objectid == bytenr &&
1495                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1496                         err = -EAGAIN;
1497                         goto out;
1498                 }
1499         }
1500         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1501 out:
1502         if (insert) {
1503                 path->keep_locks = 0;
1504                 btrfs_unlock_up_safe(path, 1);
1505         }
1506         return err;
1507 }
1508
1509 /*
1510  * helper to add new inline back ref
1511  */
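/*
 * The extent item is grown in place: btrfs_extend_item() makes room for
 * the new ref, any existing inline refs that sort after the insertion
 * point are shifted up with memmove_extent_buffer(), and the new ref is
 * written into the gap at the offset lookup_inline_extent_backref()
 * returned through *ref_ret.
 */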
1512 static noinline_for_stack
1513 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1514                                 struct btrfs_root *root,
1515                                 struct btrfs_path *path,
1516                                 struct btrfs_extent_inline_ref *iref,
1517                                 u64 parent, u64 root_objectid,
1518                                 u64 owner, u64 offset, int refs_to_add,
1519                                 struct btrfs_delayed_extent_op *extent_op)
1520 {
1521         struct extent_buffer *leaf;
1522         struct btrfs_extent_item *ei;
1523         unsigned long ptr;
1524         unsigned long end;
1525         unsigned long item_offset;
1526         u64 refs;
1527         int size;
1528         int type;
1529         int ret;
1530
1531         leaf = path->nodes[0];
1532         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1533         item_offset = (unsigned long)iref - (unsigned long)ei;
1534
1535         type = extent_ref_type(parent, owner);
1536         size = btrfs_extent_inline_ref_size(type);
1537
1538         ret = btrfs_extend_item(trans, root, path, size);
1539         BUG_ON(ret);
1540
1541         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1542         refs = btrfs_extent_refs(leaf, ei);
1543         refs += refs_to_add;
1544         btrfs_set_extent_refs(leaf, ei, refs);
1545         if (extent_op)
1546                 __run_delayed_extent_op(extent_op, leaf, ei);
1547
1548         ptr = (unsigned long)ei + item_offset;
1549         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1550         if (ptr < end - size)
1551                 memmove_extent_buffer(leaf, ptr + size, ptr,
1552                                       end - size - ptr);
1553
1554         iref = (struct btrfs_extent_inline_ref *)ptr;
1555         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1556         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1557                 struct btrfs_extent_data_ref *dref;
1558                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1559                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1560                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1561                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1562                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1563         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1564                 struct btrfs_shared_data_ref *sref;
1565                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1566                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1567                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1568         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1569                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1570         } else {
1571                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1572         }
1573         btrfs_mark_buffer_dirty(leaf);
1574         return 0;
1575 }
1576
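/*
 * Find any back ref for the given extent: try the inline back ref inside
 * the extent item first, and only when that returns -ENOENT fall back to
 * the keyed back ref items (tree block ref for metadata, extent data ref
 * for file data).
 */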
1577 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1578                                  struct btrfs_root *root,
1579                                  struct btrfs_path *path,
1580                                  struct btrfs_extent_inline_ref **ref_ret,
1581                                  u64 bytenr, u64 num_bytes, u64 parent,
1582                                  u64 root_objectid, u64 owner, u64 offset)
1583 {
1584         int ret;
1585
1586         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1587                                            bytenr, num_bytes, parent,
1588                                            root_objectid, owner, offset, 0);
1589         if (ret != -ENOENT)
1590                 return ret;
1591
1592         btrfs_release_path(root, path);
1593         *ref_ret = NULL;
1594
1595         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1596                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1597                                             root_objectid);
1598         } else {
1599                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1600                                              root_objectid, owner, offset);
1601         }
1602         return ret;
1603 }
1604
1605 /*
1606  * helper to update/remove inline back ref
1607  */
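/*
 * If the resulting count stays above zero, only the count field of the
 * data ref / shared data ref is rewritten.  When it drops to zero the
 * inline ref is cut out of the extent item: the bytes behind it are
 * memmoved down and the item is shrunk with btrfs_truncate_item().
 */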
1608 static noinline_for_stack
1609 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1610                                  struct btrfs_root *root,
1611                                  struct btrfs_path *path,
1612                                  struct btrfs_extent_inline_ref *iref,
1613                                  int refs_to_mod,
1614                                  struct btrfs_delayed_extent_op *extent_op)
1615 {
1616         struct extent_buffer *leaf;
1617         struct btrfs_extent_item *ei;
1618         struct btrfs_extent_data_ref *dref = NULL;
1619         struct btrfs_shared_data_ref *sref = NULL;
1620         unsigned long ptr;
1621         unsigned long end;
1622         u32 item_size;
1623         int size;
1624         int type;
1625         int ret;
1626         u64 refs;
1627
1628         leaf = path->nodes[0];
1629         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1630         refs = btrfs_extent_refs(leaf, ei);
1631         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1632         refs += refs_to_mod;
1633         btrfs_set_extent_refs(leaf, ei, refs);
1634         if (extent_op)
1635                 __run_delayed_extent_op(extent_op, leaf, ei);
1636
1637         type = btrfs_extent_inline_ref_type(leaf, iref);
1638
1639         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1640                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1641                 refs = btrfs_extent_data_ref_count(leaf, dref);
1642         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1643                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1644                 refs = btrfs_shared_data_ref_count(leaf, sref);
1645         } else {
1646                 refs = 1;
1647                 BUG_ON(refs_to_mod != -1);
1648         }
1649
1650         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1651         refs += refs_to_mod;
1652
1653         if (refs > 0) {
1654                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1655                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1656                 else
1657                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1658         } else {
1659                 size = btrfs_extent_inline_ref_size(type);
1660                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1661                 ptr = (unsigned long)iref;
1662                 end = (unsigned long)ei + item_size;
1663                 if (ptr + size < end)
1664                         memmove_extent_buffer(leaf, ptr, ptr + size,
1665                                               end - ptr - size);
1666                 item_size -= size;
1667                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1668                 BUG_ON(ret);
1669         }
1670         btrfs_mark_buffer_dirty(leaf);
1671         return 0;
1672 }
1673
1674 static noinline_for_stack
1675 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1676                                  struct btrfs_root *root,
1677                                  struct btrfs_path *path,
1678                                  u64 bytenr, u64 num_bytes, u64 parent,
1679                                  u64 root_objectid, u64 owner,
1680                                  u64 offset, int refs_to_add,
1681                                  struct btrfs_delayed_extent_op *extent_op)
1682 {
1683         struct btrfs_extent_inline_ref *iref;
1684         int ret;
1685
1686         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1687                                            bytenr, num_bytes, parent,
1688                                            root_objectid, owner, offset, 1);
1689         if (ret == 0) {
1690                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1691                 ret = update_inline_extent_backref(trans, root, path, iref,
1692                                                    refs_to_add, extent_op);
1693         } else if (ret == -ENOENT) {
1694                 ret = setup_inline_extent_backref(trans, root, path, iref,
1695                                                   parent, root_objectid,
1696                                                   owner, offset, refs_to_add,
1697                                                   extent_op);
1698         }
1699         return ret;
1700 }
1701
1702 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1703                                  struct btrfs_root *root,
1704                                  struct btrfs_path *path,
1705                                  u64 bytenr, u64 parent, u64 root_objectid,
1706                                  u64 owner, u64 offset, int refs_to_add)
1707 {
1708         int ret;
1709         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1710                 BUG_ON(refs_to_add != 1);
1711                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1712                                             parent, root_objectid);
1713         } else {
1714                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1715                                              parent, root_objectid,
1716                                              owner, offset, refs_to_add);
1717         }
1718         return ret;
1719 }
1720
1721 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1722                                  struct btrfs_root *root,
1723                                  struct btrfs_path *path,
1724                                  struct btrfs_extent_inline_ref *iref,
1725                                  int refs_to_drop, int is_data)
1726 {
1727         int ret;
1728
1729         BUG_ON(!is_data && refs_to_drop != 1);
1730         if (iref) {
1731                 ret = update_inline_extent_backref(trans, root, path, iref,
1732                                                    -refs_to_drop, NULL);
1733         } else if (is_data) {
1734                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1735         } else {
1736                 ret = btrfs_del_item(trans, root, path);
1737         }
1738         return ret;
1739 }
1740
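/*
 * blkdev_issue_discard() works in 512-byte sectors, hence the >> 9
 * conversions from the byte based start and length used everywhere else
 * in btrfs.
 */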
1741 static int btrfs_issue_discard(struct block_device *bdev,
1742                                 u64 start, u64 len)
1743 {
1744         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1745 }
1746
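/*
 * Discard a byte range: the range is mapped to its physical stripes and
 * each stripe is discarded separately.  A device without discard support
 * (-EOPNOTSUPP) is not treated as a hard failure as long as at least one
 * stripe was trimmed; the number of bytes actually discarded is reported
 * back through *actual_bytes.
 */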
1747 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1748                                 u64 num_bytes, u64 *actual_bytes)
1749 {
1750         int ret;
1751         u64 discarded_bytes = 0;
1752         struct btrfs_multi_bio *multi = NULL;
1753
1754
1755         /* Tell the block device(s) that the sectors can be discarded */
1756         ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1757                               bytenr, &num_bytes, &multi, 0);
1758         if (!ret) {
1759                 struct btrfs_bio_stripe *stripe = multi->stripes;
1760                 int i;
1761
1762
1763                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1764                         ret = btrfs_issue_discard(stripe->dev->bdev,
1765                                                   stripe->physical,
1766                                                   stripe->length);
1767                         if (!ret)
1768                                 discarded_bytes += stripe->length;
1769                         else if (ret != -EOPNOTSUPP)
1770                                 break;
1771                 }
1772                 kfree(multi);
1773         }
1774         if (discarded_bytes && ret == -EOPNOTSUPP)
1775                 ret = 0;
1776
1777         if (actual_bytes)
1778                 *actual_bytes = discarded_bytes;
1779
1780
1781         return ret;
1782 }
1783
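/*
 * Add a reference to an existing extent.  Nothing is changed in the
 * extent tree here; the reference is queued as a delayed ref (tree ref
 * for metadata, data ref for file extents) and applied later by
 * btrfs_run_delayed_refs().
 */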
1784 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1785                          struct btrfs_root *root,
1786                          u64 bytenr, u64 num_bytes, u64 parent,
1787                          u64 root_objectid, u64 owner, u64 offset)
1788 {
1789         int ret;
1790         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1791                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1792
1793         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1794                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1795                                         parent, root_objectid, (int)owner,
1796                                         BTRFS_ADD_DELAYED_REF, NULL);
1797         } else {
1798                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1799                                         parent, root_objectid, owner, offset,
1800                                         BTRFS_ADD_DELAYED_REF, NULL);
1801         }
1802         return ret;
1803 }
1804
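/*
 * Add refs_to_add references to an existing extent item.  The fast path
 * updates or inserts an inline back ref.  If insert_inline_extent_backref()
 * returns -EAGAIN the inline refs no longer fit in the extent item, so the
 * ref count on the extent item is bumped here and a separate keyed back
 * ref item is inserted instead.
 */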
1805 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1806                                   struct btrfs_root *root,
1807                                   u64 bytenr, u64 num_bytes,
1808                                   u64 parent, u64 root_objectid,
1809                                   u64 owner, u64 offset, int refs_to_add,
1810                                   struct btrfs_delayed_extent_op *extent_op)
1811 {
1812         struct btrfs_path *path;
1813         struct extent_buffer *leaf;
1814         struct btrfs_extent_item *item;
1815         u64 refs;
1816         int ret;
1817         int err = 0;
1818
1819         path = btrfs_alloc_path();
1820         if (!path)
1821                 return -ENOMEM;
1822
1823         path->reada = 1;
1824         path->leave_spinning = 1;
1825         /* this will set up the path even if it fails to insert the back ref */
1826         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1827                                            path, bytenr, num_bytes, parent,
1828                                            root_objectid, owner, offset,
1829                                            refs_to_add, extent_op);
1830         if (ret == 0)
1831                 goto out;
1832
1833         if (ret != -EAGAIN) {
1834                 err = ret;
1835                 goto out;
1836         }
1837
1838         leaf = path->nodes[0];
1839         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1840         refs = btrfs_extent_refs(leaf, item);
1841         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1842         if (extent_op)
1843                 __run_delayed_extent_op(extent_op, leaf, item);
1844
1845         btrfs_mark_buffer_dirty(leaf);
1846         btrfs_release_path(root->fs_info->extent_root, path);
1847
1848         path->reada = 1;
1849         path->leave_spinning = 1;
1850
1851         /* now insert the actual backref */
1852         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1853                                     path, bytenr, parent, root_objectid,
1854                                     owner, offset, refs_to_add);
1855         BUG_ON(ret);
1856 out:
1857         btrfs_free_path(path);
1858         return err;
1859 }
1860
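/*
 * Apply one delayed data ref.  An ADD ref with insert_reserved set is the
 * first reference to a freshly allocated extent and creates the extent
 * item itself; otherwise the ref is added to or dropped from the existing
 * extent item.
 */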
1861 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1862                                 struct btrfs_root *root,
1863                                 struct btrfs_delayed_ref_node *node,
1864                                 struct btrfs_delayed_extent_op *extent_op,
1865                                 int insert_reserved)
1866 {
1867         int ret = 0;
1868         struct btrfs_delayed_data_ref *ref;
1869         struct btrfs_key ins;
1870         u64 parent = 0;
1871         u64 ref_root = 0;
1872         u64 flags = 0;
1873
1874         ins.objectid = node->bytenr;
1875         ins.offset = node->num_bytes;
1876         ins.type = BTRFS_EXTENT_ITEM_KEY;
1877
1878         ref = btrfs_delayed_node_to_data_ref(node);
1879         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1880                 parent = ref->parent;
1881         else
1882                 ref_root = ref->root;
1883
1884         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1885                 if (extent_op) {
1886                         BUG_ON(extent_op->update_key);
1887                         flags |= extent_op->flags_to_set;
1888                 }
1889                 ret = alloc_reserved_file_extent(trans, root,
1890                                                  parent, ref_root, flags,
1891                                                  ref->objectid, ref->offset,
1892                                                  &ins, node->ref_mod);
1893         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1894                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1895                                              node->num_bytes, parent,
1896                                              ref_root, ref->objectid,
1897                                              ref->offset, node->ref_mod,
1898                                              extent_op);
1899         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1900                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1901                                           node->num_bytes, parent,
1902                                           ref_root, ref->objectid,
1903                                           ref->offset, node->ref_mod,
1904                                           extent_op);
1905         } else {
1906                 BUG();
1907         }
1908         return ret;
1909 }
1910
1911 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1912                                     struct extent_buffer *leaf,
1913                                     struct btrfs_extent_item *ei)
1914 {
1915         u64 flags = btrfs_extent_flags(leaf, ei);
1916         if (extent_op->update_flags) {
1917                 flags |= extent_op->flags_to_set;
1918                 btrfs_set_extent_flags(leaf, ei, flags);
1919         }
1920
1921         if (extent_op->update_key) {
1922                 struct btrfs_tree_block_info *bi;
1923                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1924                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1925                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1926         }
1927 }
1928
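/*
 * Apply a standalone extent op (flag and/or key update) to the extent
 * item for node->bytenr: look the item up in the extent tree and let
 * __run_delayed_extent_op() rewrite it in place.
 */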
1929 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1930                                  struct btrfs_root *root,
1931                                  struct btrfs_delayed_ref_node *node,
1932                                  struct btrfs_delayed_extent_op *extent_op)
1933 {
1934         struct btrfs_key key;
1935         struct btrfs_path *path;
1936         struct btrfs_extent_item *ei;
1937         struct extent_buffer *leaf;
1938         u32 item_size;
1939         int ret;
1940         int err = 0;
1941
1942         path = btrfs_alloc_path();
1943         if (!path)
1944                 return -ENOMEM;
1945
1946         key.objectid = node->bytenr;
1947         key.type = BTRFS_EXTENT_ITEM_KEY;
1948         key.offset = node->num_bytes;
1949
1950         path->reada = 1;
1951         path->leave_spinning = 1;
1952         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1953                                 path, 0, 1);
1954         if (ret < 0) {
1955                 err = ret;
1956                 goto out;
1957         }
1958         if (ret > 0) {
1959                 err = -EIO;
1960                 goto out;
1961         }
1962
1963         leaf = path->nodes[0];
1964         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1965 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1966         if (item_size < sizeof(*ei)) {
1967                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1968                                              path, (u64)-1, 0);
1969                 if (ret < 0) {
1970                         err = ret;
1971                         goto out;
1972                 }
1973                 leaf = path->nodes[0];
1974                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1975         }
1976 #endif
1977         BUG_ON(item_size < sizeof(*ei));
1978         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1979         __run_delayed_extent_op(extent_op, leaf, ei);
1980
1981         btrfs_mark_buffer_dirty(leaf);
1982 out:
1983         btrfs_free_path(path);
1984         return err;
1985 }
1986
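/*
 * Tree block counterpart of run_delayed_data_ref().  Tree block refs are
 * always applied one at a time (ref_mod == 1), and the extent item for a
 * freshly allocated block is created with the flags and block key carried
 * by the extent op.
 */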
1987 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1988                                 struct btrfs_root *root,
1989                                 struct btrfs_delayed_ref_node *node,
1990                                 struct btrfs_delayed_extent_op *extent_op,
1991                                 int insert_reserved)
1992 {
1993         int ret = 0;
1994         struct btrfs_delayed_tree_ref *ref;
1995         struct btrfs_key ins;
1996         u64 parent = 0;
1997         u64 ref_root = 0;
1998
1999         ins.objectid = node->bytenr;
2000         ins.offset = node->num_bytes;
2001         ins.type = BTRFS_EXTENT_ITEM_KEY;
2002
2003         ref = btrfs_delayed_node_to_tree_ref(node);
2004         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2005                 parent = ref->parent;
2006         else
2007                 ref_root = ref->root;
2008
2009         BUG_ON(node->ref_mod != 1);
2010         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2011                 BUG_ON(!extent_op || !extent_op->update_flags ||
2012                        !extent_op->update_key);
2013                 ret = alloc_reserved_tree_block(trans, root,
2014                                                 parent, ref_root,
2015                                                 extent_op->flags_to_set,
2016                                                 &extent_op->key,
2017                                                 ref->level, &ins);
2018         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2019                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2020                                              node->num_bytes, parent, ref_root,
2021                                              ref->level, 0, 1, extent_op);
2022         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2023                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2024                                           node->num_bytes, parent, ref_root,
2025                                           ref->level, 0, 1, extent_op);
2026         } else {
2027                 BUG();
2028         }
2029         return ret;
2030 }
2031
2032 /* helper function to actually process a single delayed ref entry */
2033 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2034                                struct btrfs_root *root,
2035                                struct btrfs_delayed_ref_node *node,
2036                                struct btrfs_delayed_extent_op *extent_op,
2037                                int insert_reserved)
2038 {
2039         int ret;
2040         if (btrfs_delayed_ref_is_head(node)) {
2041                 struct btrfs_delayed_ref_head *head;
2042                 /*
2043                  * we've hit the end of the chain and we were supposed
2044                  * to insert this extent into the tree.  But, it got
2045                  * deleted before we ever needed to insert it, so all
2046                  * we have to do is clean up the accounting
2047                  */
2048                 BUG_ON(extent_op);
2049                 head = btrfs_delayed_node_to_head(node);
2050                 if (insert_reserved) {
2051                         btrfs_pin_extent(root, node->bytenr,
2052                                          node->num_bytes, 1);
2053                         if (head->is_data) {
2054                                 ret = btrfs_del_csums(trans, root,
2055                                                       node->bytenr,
2056                                                       node->num_bytes);
2057                                 BUG_ON(ret);
2058                         }
2059                 }
2060                 mutex_unlock(&head->mutex);
2061                 return 0;
2062         }
2063
2064         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2065             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2066                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2067                                            insert_reserved);
2068         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2069                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2070                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2071                                            insert_reserved);
2072         else
2073                 BUG();
2074         return ret;
2075 }
2076
2077 static noinline struct btrfs_delayed_ref_node *
2078 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2079 {
2080         struct rb_node *node;
2081         struct btrfs_delayed_ref_node *ref;
2082         int action = BTRFS_ADD_DELAYED_REF;
2083 again:
2084         /*
2085          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2086          * this prevents the ref count from going down to zero while
2087          * there are still pending delayed refs.
2088          */
2089         node = rb_prev(&head->node.rb_node);
2090         while (1) {
2091                 if (!node)
2092                         break;
2093                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2094                                 rb_node);
2095                 if (ref->bytenr != head->node.bytenr)
2096                         break;
2097                 if (ref->action == action)
2098                         return ref;
2099                 node = rb_prev(node);
2100         }
2101         if (action == BTRFS_ADD_DELAYED_REF) {
2102                 action = BTRFS_DROP_DELAYED_REF;
2103                 goto again;
2104         }
2105         return NULL;
2106 }
2107
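/*
 * Run all delayed refs attached to the heads on the cluster list.  For
 * each head we replay the individual refs first (adds before drops, per
 * select_delayed_ref()), run any pending extent op, and finally hand the
 * head node itself to run_one_delayed_ref() so reserved space accounting
 * gets cleaned up.  Returns the number of refs processed.
 */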
2108 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2109                                        struct btrfs_root *root,
2110                                        struct list_head *cluster)
2111 {
2112         struct btrfs_delayed_ref_root *delayed_refs;
2113         struct btrfs_delayed_ref_node *ref;
2114         struct btrfs_delayed_ref_head *locked_ref = NULL;
2115         struct btrfs_delayed_extent_op *extent_op;
2116         int ret;
2117         int count = 0;
2118         int must_insert_reserved = 0;
2119
2120         delayed_refs = &trans->transaction->delayed_refs;
2121         while (1) {
2122                 if (!locked_ref) {
2123                         /* pick a new head ref from the cluster list */
2124                         if (list_empty(cluster))
2125                                 break;
2126
2127                         locked_ref = list_entry(cluster->next,
2128                                      struct btrfs_delayed_ref_head, cluster);
2129
2130                         /* grab the lock that says we are going to process
2131                          * all the refs for this head */
2132                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2133
2134                         /*
2135                          * we may have dropped the spin lock to get the head
2136                          * mutex lock, and that might have given someone else
2137                          * time to free the head.  If that's true, it has been
2138                          * removed from our list and we can move on.
2139                          */
2140                         if (ret == -EAGAIN) {
2141                                 locked_ref = NULL;
2142                                 count++;
2143                                 continue;
2144                         }
2145                 }
2146
2147                 /*
2148                  * record the must insert reserved flag before we
2149                  * drop the spin lock.
2150                  */
2151                 must_insert_reserved = locked_ref->must_insert_reserved;
2152                 locked_ref->must_insert_reserved = 0;
2153
2154                 extent_op = locked_ref->extent_op;
2155                 locked_ref->extent_op = NULL;
2156
2157                 /*
2158                  * locked_ref is the head node, so we have to go one
2159                  * node back for any delayed ref updates
2160                  */
2161                 ref = select_delayed_ref(locked_ref);
2162                 if (!ref) {
2163                         /* All delayed refs have been processed, go ahead
2164                          * and send the head node to run_one_delayed_ref,
2165                          * so that any accounting fixes can happen
2166                          */
2167                         ref = &locked_ref->node;
2168
2169                         if (extent_op && must_insert_reserved) {
2170                                 kfree(extent_op);
2171                                 extent_op = NULL;
2172                         }
2173
2174                         if (extent_op) {
2175                                 spin_unlock(&delayed_refs->lock);
2176
2177                                 ret = run_delayed_extent_op(trans, root,
2178                                                             ref, extent_op);
2179                                 BUG_ON(ret);
2180                                 kfree(extent_op);
2181
2182                                 cond_resched();
2183                                 spin_lock(&delayed_refs->lock);
2184                                 continue;
2185                         }
2186
2187                         list_del_init(&locked_ref->cluster);
2188                         locked_ref = NULL;
2189                 }
2190
2191                 ref->in_tree = 0;
2192                 rb_erase(&ref->rb_node, &delayed_refs->root);
2193                 delayed_refs->num_entries--;
2194
2195                 spin_unlock(&delayed_refs->lock);
2196
2197                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2198                                           must_insert_reserved);
2199                 BUG_ON(ret);
2200
2201                 btrfs_put_delayed_ref(ref);
2202                 kfree(extent_op);
2203                 count++;
2204
2205                 cond_resched();
2206                 spin_lock(&delayed_refs->lock);
2207         }
2208         return count;
2209 }
2210
2211 /*
2212  * this starts processing the delayed reference count updates and
2213  * extent insertions we have queued up so far.  count can be
2214  * 0, which means to process everything in the tree at the start
2215  * of the run (but not newly added entries), or it can be some target
2216  * number you'd like to process.
2217  */
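/*
 * A count of (unsigned long)-1 runs everything: after the cluster loop we
 * keep rescanning the rbtree, waiting on any head currently being
 * processed by another task, until no delayed refs remain.
 */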
2218 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2219                            struct btrfs_root *root, unsigned long count)
2220 {
2221         struct rb_node *node;
2222         struct btrfs_delayed_ref_root *delayed_refs;
2223         struct btrfs_delayed_ref_node *ref;
2224         struct list_head cluster;
2225         int ret;
2226         int run_all = count == (unsigned long)-1;
2227         int run_most = 0;
2228
2229         if (root == root->fs_info->extent_root)
2230                 root = root->fs_info->tree_root;
2231
2232         delayed_refs = &trans->transaction->delayed_refs;
2233         INIT_LIST_HEAD(&cluster);
2234 again:
2235         spin_lock(&delayed_refs->lock);
2236         if (count == 0) {
2237                 count = delayed_refs->num_entries * 2;
2238                 run_most = 1;
2239         }
2240         while (1) {
2241                 if (!(run_all || run_most) &&
2242                     delayed_refs->num_heads_ready < 64)
2243                         break;
2244
2245                 /*
2246                  * go find something we can process in the rbtree.  We start at
2247                  * the beginning of the tree, and then build a cluster
2248                  * of refs to process starting at the first one we are able to
2249                  * lock
2250                  */
2251                 ret = btrfs_find_ref_cluster(trans, &cluster,
2252                                              delayed_refs->run_delayed_start);
2253                 if (ret)
2254                         break;
2255
2256                 ret = run_clustered_refs(trans, root, &cluster);
2257                 BUG_ON(ret < 0);
2258
2259                 count -= min_t(unsigned long, ret, count);
2260
2261                 if (count == 0)
2262                         break;
2263         }
2264
2265         if (run_all) {
2266                 node = rb_first(&delayed_refs->root);
2267                 if (!node)
2268                         goto out;
2269                 count = (unsigned long)-1;
2270
2271                 while (node) {
2272                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2273                                        rb_node);
2274                         if (btrfs_delayed_ref_is_head(ref)) {
2275                                 struct btrfs_delayed_ref_head *head;
2276
2277                                 head = btrfs_delayed_node_to_head(ref);
2278                                 atomic_inc(&ref->refs);
2279
2280                                 spin_unlock(&delayed_refs->lock);
2281                                 mutex_lock(&head->mutex);
2282                                 mutex_unlock(&head->mutex);
2283
2284                                 btrfs_put_delayed_ref(ref);
2285                                 cond_resched();
2286                                 goto again;
2287                         }
2288                         node = rb_next(node);
2289                 }
2290                 spin_unlock(&delayed_refs->lock);
2291                 schedule_timeout(1);
2292                 goto again;
2293         }
2294 out:
2295         spin_unlock(&delayed_refs->lock);
2296         return 0;
2297 }
2298
2299 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2300                                 struct btrfs_root *root,
2301                                 u64 bytenr, u64 num_bytes, u64 flags,
2302                                 int is_data)
2303 {
2304         struct btrfs_delayed_extent_op *extent_op;
2305         int ret;
2306
2307         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2308         if (!extent_op)
2309                 return -ENOMEM;
2310
2311         extent_op->flags_to_set = flags;
2312         extent_op->update_flags = 1;
2313         extent_op->update_key = 0;
2314         extent_op->is_data = is_data ? 1 : 0;
2315
2316         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2317         if (ret)
2318                 kfree(extent_op);
2319         return ret;
2320 }
2321
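/*
 * Check the pending delayed refs for a reference on bytenr held by anyone
 * other than (root, objectid, offset).  Returns 0 when the only pending
 * ref is our own, 1 when another (or an unknown kind of) ref exists,
 * -ENOENT when there are no pending refs for this extent, and -EAGAIN
 * when the locks had to be dropped and the caller should retry.
 */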
2322 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2323                                       struct btrfs_root *root,
2324                                       struct btrfs_path *path,
2325                                       u64 objectid, u64 offset, u64 bytenr)
2326 {
2327         struct btrfs_delayed_ref_head *head;
2328         struct btrfs_delayed_ref_node *ref;
2329         struct btrfs_delayed_data_ref *data_ref;
2330         struct btrfs_delayed_ref_root *delayed_refs;
2331         struct rb_node *node;
2332         int ret = 0;
2333
2334         ret = -ENOENT;
2335         delayed_refs = &trans->transaction->delayed_refs;
2336         spin_lock(&delayed_refs->lock);
2337         head = btrfs_find_delayed_ref_head(trans, bytenr);
2338         if (!head)
2339                 goto out;
2340
2341         if (!mutex_trylock(&head->mutex)) {
2342                 atomic_inc(&head->node.refs);
2343                 spin_unlock(&delayed_refs->lock);
2344
2345                 btrfs_release_path(root->fs_info->extent_root, path);
2346
2347                 mutex_lock(&head->mutex);
2348                 mutex_unlock(&head->mutex);
2349                 btrfs_put_delayed_ref(&head->node);
2350                 return -EAGAIN;
2351         }
2352
2353         node = rb_prev(&head->node.rb_node);
2354         if (!node)
2355                 goto out_unlock;
2356
2357         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2358
2359         if (ref->bytenr != bytenr)
2360                 goto out_unlock;
2361
2362         ret = 1;
2363         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2364                 goto out_unlock;
2365
2366         data_ref = btrfs_delayed_node_to_data_ref(ref);
2367
2368         node = rb_prev(node);
2369         if (node) {
2370                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2371                 if (ref->bytenr == bytenr)
2372                         goto out_unlock;
2373         }
2374
2375         if (data_ref->root != root->root_key.objectid ||
2376             data_ref->objectid != objectid || data_ref->offset != offset)
2377                 goto out_unlock;
2378
2379         ret = 0;
2380 out_unlock:
2381         mutex_unlock(&head->mutex);
2382 out:
2383         spin_unlock(&delayed_refs->lock);
2384         return ret;
2385 }
2386
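/*
 * Same question as check_delayed_ref(), asked of the committed extent
 * tree: to return 0 the extent must carry exactly one inline
 * EXTENT_DATA_REF matching (root, objectid, offset), its ref count must
 * equal that single ref, and it must be newer than the root's last
 * snapshot.  Anything else returns 1 (or -ENOENT if no extent item was
 * found) and the extent has to be treated as shared.
 */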
2387 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2388                                         struct btrfs_root *root,
2389                                         struct btrfs_path *path,
2390                                         u64 objectid, u64 offset, u64 bytenr)
2391 {
2392         struct btrfs_root *extent_root = root->fs_info->extent_root;
2393         struct extent_buffer *leaf;
2394         struct btrfs_extent_data_ref *ref;
2395         struct btrfs_extent_inline_ref *iref;
2396         struct btrfs_extent_item *ei;
2397         struct btrfs_key key;
2398         u32 item_size;
2399         int ret;
2400
2401         key.objectid = bytenr;
2402         key.offset = (u64)-1;
2403         key.type = BTRFS_EXTENT_ITEM_KEY;
2404
2405         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2406         if (ret < 0)
2407                 goto out;
2408         BUG_ON(ret == 0);
2409
2410         ret = -ENOENT;
2411         if (path->slots[0] == 0)
2412                 goto out;
2413
2414         path->slots[0]--;
2415         leaf = path->nodes[0];
2416         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2417
2418         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2419                 goto out;
2420
2421         ret = 1;
2422         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2423 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2424         if (item_size < sizeof(*ei)) {
2425                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2426                 goto out;
2427         }
2428 #endif
2429         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2430
2431         if (item_size != sizeof(*ei) +
2432             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2433                 goto out;
2434
2435         if (btrfs_extent_generation(leaf, ei) <=
2436             btrfs_root_last_snapshot(&root->root_item))
2437                 goto out;
2438
2439         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2440         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2441             BTRFS_EXTENT_DATA_REF_KEY)
2442                 goto out;
2443
2444         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2445         if (btrfs_extent_refs(leaf, ei) !=
2446             btrfs_extent_data_ref_count(leaf, ref) ||
2447             btrfs_extent_data_ref_root(leaf, ref) !=
2448             root->root_key.objectid ||
2449             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2450             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2451                 goto out;
2452
2453         ret = 0;
2454 out:
2455         return ret;
2456 }
2457
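/*
 * Returns 0 only when both the committed extent tree and the delayed refs
 * agree that (root, objectid, offset) is the sole holder of a reference
 * on bytenr; any other reference, or any error, makes the extent count as
 * cross referenced.  Callers (the nodatacow checks, for instance) use
 * this to decide whether a data extent may be overwritten in place.
 */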
2458 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2459                           struct btrfs_root *root,
2460                           u64 objectid, u64 offset, u64 bytenr)
2461 {
2462         struct btrfs_path *path;
2463         int ret;
2464         int ret2;
2465
2466         path = btrfs_alloc_path();
2467         if (!path)
2468                 return -ENOENT;
2469
2470         do {
2471                 ret = check_committed_ref(trans, root, path, objectid,
2472                                           offset, bytenr);
2473                 if (ret && ret != -ENOENT)
2474                         goto out;
2475
2476                 ret2 = check_delayed_ref(trans, root, path, objectid,
2477                                          offset, bytenr);
2478         } while (ret2 == -EAGAIN);
2479
2480         if (ret2 && ret2 != -ENOENT) {
2481                 ret = ret2;
2482                 goto out;
2483         }
2484
2485         if (ret != -ENOENT || ret2 != -ENOENT)
2486                 ret = 0;
2487 out:
2488         btrfs_free_path(path);
2489         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2490                 WARN_ON(ret > 0);
2491         return ret;
2492 }
2493
2494 #if 0
2495 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2496                     struct extent_buffer *buf, u32 nr_extents)
2497 {
2498         struct btrfs_key key;
2499         struct btrfs_file_extent_item *fi;
2500         u64 root_gen;
2501         u32 nritems;
2502         int i;
2503         int level;
2504         int ret = 0;
2505         int shared = 0;
2506
2507         if (!root->ref_cows)
2508                 return 0;
2509
2510         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2511                 shared = 0;
2512                 root_gen = root->root_key.offset;
2513         } else {
2514                 shared = 1;
2515                 root_gen = trans->transid - 1;
2516         }
2517
2518         level = btrfs_header_level(buf);
2519         nritems = btrfs_header_nritems(buf);
2520
2521         if (level == 0) {
2522                 struct btrfs_leaf_ref *ref;
2523                 struct btrfs_extent_info *info;
2524
2525                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2526                 if (!ref) {
2527                         ret = -ENOMEM;
2528                         goto out;
2529                 }
2530
2531                 ref->root_gen = root_gen;
2532                 ref->bytenr = buf->start;
2533                 ref->owner = btrfs_header_owner(buf);
2534                 ref->generation = btrfs_header_generation(buf);
2535                 ref->nritems = nr_extents;
2536                 info = ref->extents;
2537
2538                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2539                         u64 disk_bytenr;
2540                         btrfs_item_key_to_cpu(buf, &key, i);
2541                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2542                                 continue;
2543                         fi = btrfs_item_ptr(buf, i,
2544                                             struct btrfs_file_extent_item);
2545                         if (btrfs_file_extent_type(buf, fi) ==
2546                             BTRFS_FILE_EXTENT_INLINE)
2547                                 continue;
2548                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2549                         if (disk_bytenr == 0)
2550                                 continue;
2551
2552                         info->bytenr = disk_bytenr;
2553                         info->num_bytes =
2554                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2555                         info->objectid = key.objectid;
2556                         info->offset = key.offset;
2557                         info++;
2558                 }
2559
2560                 ret = btrfs_add_leaf_ref(root, ref, shared);
2561                 if (ret == -EEXIST && shared) {
2562                         struct btrfs_leaf_ref *old;
2563                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2564                         BUG_ON(!old);
2565                         btrfs_remove_leaf_ref(root, old);
2566                         btrfs_free_leaf_ref(root, old);
2567                         ret = btrfs_add_leaf_ref(root, ref, shared);
2568                 }
2569                 WARN_ON(ret);
2570                 btrfs_free_leaf_ref(root, ref);
2571         }
2572 out:
2573         return ret;
2574 }
2575
2576 /* when a block goes through cow, we update the reference counts of
2577  * everything that block points to.  The internal pointers of the block
2578  * can be in just about any order, and it is likely to have clusters of
2579  * things that are close together and clusters of things that are not.
2580  *
2581  * To help reduce the seeks that come with updating all of these reference
2582  * counts, sort them by byte number before actual updates are done.
2583  *
2584  * struct refsort is used to match byte number to slot in the btree block.
2585  * we sort based on the byte number and then use the slot to actually
2586  * find the item.
2587  *
2588  * struct refsort is smaller than struct btrfs_item and smaller than
2589  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2590  * for a btree block, there's no way for a kmalloc of refsorts for a
2591  * single node to be bigger than a page.
2592  */
2593 struct refsort {
2594         u64 bytenr;
2595         u32 slot;
2596 };
2597
2598 /*
2599  * for passing into sort()
2600  */
2601 static int refsort_cmp(const void *a_void, const void *b_void)
2602 {
2603         const struct refsort *a = a_void;
2604         const struct refsort *b = b_void;
2605
2606         if (a->bytenr < b->bytenr)
2607                 return -1;
2608         if (a->bytenr > b->bytenr)
2609                 return 1;
2610         return 0;
2611 }
2612 #endif
2613
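/*
 * Walk every pointer in a tree block and add (inc != 0) or drop one
 * reference for each extent it points to: file extent items if the block
 * is a leaf, child blocks if it is a node.  full_backref selects whether
 * the refs are keyed on this block (shared back refs) or on the owning
 * root.
 */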
2614 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2615                            struct btrfs_root *root,
2616                            struct extent_buffer *buf,
2617                            int full_backref, int inc)
2618 {
2619         u64 bytenr;
2620         u64 num_bytes;
2621         u64 parent;
2622         u64 ref_root;
2623         u32 nritems;
2624         struct btrfs_key key;
2625         struct btrfs_file_extent_item *fi;
2626         int i;
2627         int level;
2628         int ret = 0;
2629         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2630                             u64, u64, u64, u64, u64, u64);
2631
2632         ref_root = btrfs_header_owner(buf);
2633         nritems = btrfs_header_nritems(buf);
2634         level = btrfs_header_level(buf);
2635
2636         if (!root->ref_cows && level == 0)
2637                 return 0;
2638
2639         if (inc)
2640                 process_func = btrfs_inc_extent_ref;
2641         else
2642                 process_func = btrfs_free_extent;
2643
2644         if (full_backref)
2645                 parent = buf->start;
2646         else
2647                 parent = 0;
2648
2649         for (i = 0; i < nritems; i++) {
2650                 if (level == 0) {
2651                         btrfs_item_key_to_cpu(buf, &key, i);
2652                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2653                                 continue;
2654                         fi = btrfs_item_ptr(buf, i,
2655                                             struct btrfs_file_extent_item);
2656                         if (btrfs_file_extent_type(buf, fi) ==
2657                             BTRFS_FILE_EXTENT_INLINE)
2658                                 continue;
2659                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2660                         if (bytenr == 0)
2661                                 continue;
2662
2663                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2664                         key.offset -= btrfs_file_extent_offset(buf, fi);
2665                         ret = process_func(trans, root, bytenr, num_bytes,
2666                                            parent, ref_root, key.objectid,
2667                                            key.offset);
2668                         if (ret)
2669                                 goto fail;
2670                 } else {
2671                         bytenr = btrfs_node_blockptr(buf, i);
2672                         num_bytes = btrfs_level_size(root, level - 1);
2673                         ret = process_func(trans, root, bytenr, num_bytes,
2674                                            parent, ref_root, level - 1, 0);
2675                         if (ret)
2676                                 goto fail;
2677                 }
2678         }
2679         return 0;
2680 fail:
2681         BUG();
2682         return ret;
2683 }
2684
2685 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2686                   struct extent_buffer *buf, int full_backref)
2687 {
2688         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2689 }
2690
2691 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2692                   struct extent_buffer *buf, int full_backref)
2693 {
2694         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2695 }
2696
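/*
 * Copy the in-memory block group item for 'cache' over its existing
 * BLOCK_GROUP_ITEM in the extent tree.  The item must already exist; a
 * search miss is a bug here.
 */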
2697 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2698                                  struct btrfs_root *root,
2699                                  struct btrfs_path *path,
2700                                  struct btrfs_block_group_cache *cache)
2701 {
2702         int ret;
2703         struct btrfs_root *extent_root = root->fs_info->extent_root;
2704         unsigned long bi;
2705         struct extent_buffer *leaf;
2706
2707         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2708         if (ret < 0)
2709                 goto fail;
2710         BUG_ON(ret);
2711
2712         leaf = path->nodes[0];
2713         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2714         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2715         btrfs_mark_buffer_dirty(leaf);
2716         btrfs_release_path(extent_root, path);
2717 fail:
2718         if (ret)
2719                 return ret;
2720         return 0;
2721
2722 }
2723
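/*
 * Return the block group that follows 'cache' in the block group rbtree,
 * taking a reference on the returned group and dropping the one held on
 * 'cache'.  Returns NULL at the end of the tree.
 */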
2724 static struct btrfs_block_group_cache *
2725 next_block_group(struct btrfs_root *root,
2726                  struct btrfs_block_group_cache *cache)
2727 {
2728         struct rb_node *node;
2729         spin_lock(&root->fs_info->block_group_cache_lock);
2730         node = rb_next(&cache->cache_node);
2731         btrfs_put_block_group(cache);
2732         if (node) {
2733                 cache = rb_entry(node, struct btrfs_block_group_cache,
2734                                  cache_node);
2735                 btrfs_get_block_group(cache);
2736         } else
2737                 cache = NULL;
2738         spin_unlock(&root->fs_info->block_group_cache_lock);
2739         return cache;
2740 }
2741
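/*
 * Prepare the free space cache inode for @block_group so its free space can
 * be written out at commit time: reset the cache generation, truncate any
 * stale contents and preallocate room for the new cache.  The outcome is
 * recorded in block_group->disk_cache_state.
 */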
2742 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2743                             struct btrfs_trans_handle *trans,
2744                             struct btrfs_path *path)
2745 {
2746         struct btrfs_root *root = block_group->fs_info->tree_root;
2747         struct inode *inode = NULL;
2748         u64 alloc_hint = 0;
2749         int dcs = BTRFS_DC_ERROR;
2750         int num_pages = 0;
2751         int retries = 0;
2752         int ret = 0;
2753
2754         /*
2755          * If this block group is smaller than 100 megs, don't bother caching the
2756          * block group.
2757          */
2758         if (block_group->key.offset < (100 * 1024 * 1024)) {
2759                 spin_lock(&block_group->lock);
2760                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2761                 spin_unlock(&block_group->lock);
2762                 return 0;
2763         }
2764
2765 again:
2766         inode = lookup_free_space_inode(root, block_group, path);
2767         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2768                 ret = PTR_ERR(inode);
2769                 btrfs_release_path(root, path);
2770                 goto out;
2771         }
2772
2773         if (IS_ERR(inode)) {
2774                 BUG_ON(retries);
2775                 retries++;
2776
2777                 if (block_group->ro)
2778                         goto out_free;
2779
2780                 ret = create_free_space_inode(root, trans, block_group, path);
2781                 if (ret)
2782                         goto out_free;
2783                 goto again;
2784         }
2785
2786         /*
2787          * We want to set the generation to 0, so that if anything goes wrong
2788          * from here on out we know not to trust this cache the next time we
2789          * load it up.
2790          */
2791         BTRFS_I(inode)->generation = 0;
2792         ret = btrfs_update_inode(trans, root, inode);
2793         WARN_ON(ret);
2794
2795         if (i_size_read(inode) > 0) {
2796                 ret = btrfs_truncate_free_space_cache(root, trans, path,
2797                                                       inode);
2798                 if (ret)
2799                         goto out_put;
2800         }
2801
2802         spin_lock(&block_group->lock);
2803         if (block_group->cached != BTRFS_CACHE_FINISHED) {
2804                 /* We're not cached, don't bother trying to write stuff out */
2805                 dcs = BTRFS_DC_WRITTEN;
2806                 spin_unlock(&block_group->lock);
2807                 goto out_put;
2808         }
2809         spin_unlock(&block_group->lock);
2810
2811         num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2812         if (!num_pages)
2813                 num_pages = 1;
2814
2815         /*
2816          * Just to make absolutely sure we have enough space, we're going to
2817          * preallocate 16 pages worth of space for each gigabyte of block
2818          * group.  In practice we ought to use at most 8, but we need extra
2819          * space so we can add our header and have a terminator between the
2820          * extents and the bitmaps.
2821          */
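        /* e.g. with 4K pages, a 1 GiB block group preallocates 16 * 4096 = 64KiB */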
2822         num_pages *= 16;
2823         num_pages *= PAGE_CACHE_SIZE;
2824
2825         ret = btrfs_check_data_free_space(inode, num_pages);
2826         if (ret)
2827                 goto out_put;
2828
2829         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2830                                               num_pages, num_pages,
2831                                               &alloc_hint);
2832         if (!ret)
2833                 dcs = BTRFS_DC_SETUP;
2834         btrfs_free_reserved_data_space(inode, num_pages);
2835 out_put:
2836         iput(inode);
2837 out_free:
2838         btrfs_release_path(root, path);
2839 out:
2840         spin_lock(&block_group->lock);
2841         block_group->disk_cache_state = dcs;
2842         spin_unlock(&block_group->lock);
2843
2844         return ret;
2845 }
2846
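/*
 * Write out the dirty block group items and their free space caches.  This
 * makes three passes over the block groups: first set up free space caches
 * for any groups still marked BTRFS_DC_CLEAR, then write out the dirty block
 * group items themselves, and finally write out the caches set up in the
 * first pass.
 */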
2847 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2848                                    struct btrfs_root *root)
2849 {
2850         struct btrfs_block_group_cache *cache;
2851         int err = 0;
2852         struct btrfs_path *path;
2853         u64 last = 0;
2854
2855         path = btrfs_alloc_path();
2856         if (!path)
2857                 return -ENOMEM;
2858
2859 again:
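        /* Pass 1: set up free space caches for groups still marked DC_CLEAR */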
2860         while (1) {
2861                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2862                 while (cache) {
2863                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2864                                 break;
2865                         cache = next_block_group(root, cache);
2866                 }
2867                 if (!cache) {
2868                         if (last == 0)
2869                                 break;
2870                         last = 0;
2871                         continue;
2872                 }
2873                 err = cache_save_setup(cache, trans, path);
2874                 last = cache->key.objectid + cache->key.offset;
2875                 btrfs_put_block_group(cache);
2876         }
2877
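        /* Pass 2: write out the dirty block group items */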
2878         while (1) {
2879                 if (last == 0) {
2880                         err = btrfs_run_delayed_refs(trans, root,
2881                                                      (unsigned long)-1);
2882                         BUG_ON(err);
2883                 }
2884
2885                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2886                 while (cache) {
2887                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2888                                 btrfs_put_block_group(cache);
2889                                 goto again;
2890                         }
2891
2892                         if (cache->dirty)
2893                                 break;
2894                         cache = next_block_group(root, cache);
2895                 }
2896                 if (!cache) {
2897                         if (last == 0)
2898                                 break;
2899                         last = 0;
2900                         continue;
2901                 }
2902
2903                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2904                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2905                 cache->dirty = 0;
2906                 last = cache->key.objectid + cache->key.offset;
2907
2908                 err = write_one_cache_group(trans, root, path, cache);
2909                 BUG_ON(err);
2910                 btrfs_put_block_group(cache);
2911         }
2912
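        /* Pass 3: write out the free space caches set up in pass 1 */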
2913         while (1) {
2914                 /*
2915                  * I don't think this is needed since we're just marking our
2916                  * preallocated extent as written, but it can't hurt to do it
2917                  * just in case.
2918                  */
2919                 if (last == 0) {
2920                         err = btrfs_run_delayed_refs(trans, root,
2921                                                      (unsigned long)-1);
2922                         BUG_ON(err);
2923                 }
2924
2925                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2926                 while (cache) {
2927                         /*
2928                          * Really this shouldn't happen, but it could if we
2929                          * couldn't write the entire preallocated extent and
2930                          * splitting the extent resulted in a new block.
2931                          */
2932                         if (cache->dirty) {
2933                                 btrfs_put_block_group(cache);
2934                                 goto again;
2935                         }
2936                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2937                                 break;
2938                         cache = next_block_group(root, cache);
2939                 }
2940                 if (!cache) {
2941                         if (last == 0)
2942                                 break;
2943                         last = 0;
2944                         continue;
2945                 }
2946
2947                 btrfs_write_out_cache(root, trans, cache, path);
2948
2949                 /*
2950                  * If we didn't have an error then the cache state is still
2951                  * NEED_WRITE, so we can set it to WRITTEN.
2952                  */
2953                 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2954                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
2955                 last = cache->key.objectid + cache->key.offset;
2956                 btrfs_put_block_group(cache);
2957         }
2958
2959         btrfs_free_path(path);
2960         return 0;
2961 }
2962
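/*
 * Return 1 if the block group containing @bytenr is read-only (or cannot be
 * found), 0 otherwise.
 */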
2963 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2964 {
2965         struct btrfs_block_group_cache *block_group;
2966         int readonly = 0;
2967
2968         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2969         if (!block_group || block_group->ro)
2970                 readonly = 1;
2971         if (block_group)
2972                 btrfs_put_block_group(block_group);
2973         return readonly;
2974 }
2975
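/*
 * Create or update the space_info matching the given allocation @flags.  The
 * factor is 2 for the mirrored profiles (DUP/RAID1/RAID10), which consume two
 * bytes of raw disk space per byte of usable space.
 */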
2976 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2977                              u64 total_bytes, u64 bytes_used,
2978                              struct btrfs_space_info **space_info)
2979 {
2980         struct btrfs_space_info *found;
2981         int i;
2982         int factor;
2983
2984         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2985                      BTRFS_BLOCK_GROUP_RAID10))
2986                 factor = 2;
2987         else
2988                 factor = 1;
2989
2990         found = __find_space_info(info, flags);
2991         if (found) {
2992                 spin_lock(&found->lock);
2993                 found->total_bytes += total_bytes;
2994                 found->disk_total += total_bytes * factor;
2995                 found->bytes_used += bytes_used;
2996                 found->disk_used += bytes_used * factor;
2997                 found->full = 0;
2998                 spin_unlock(&found->lock);
2999                 *space_info = found;
3000                 return 0;
3001         }
3002         found = kzalloc(sizeof(*found), GFP_NOFS);
3003         if (!found)
3004                 return -ENOMEM;
3005
3006         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3007                 INIT_LIST_HEAD(&found->block_groups[i]);
3008         init_rwsem(&found->groups_sem);
3009         spin_lock_init(&found->lock);
3010         found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
3011                                 BTRFS_BLOCK_GROUP_SYSTEM |
3012                                 BTRFS_BLOCK_GROUP_METADATA);
3013         found->total_bytes = total_bytes;
3014         found->disk_total = total_bytes * factor;
3015         found->bytes_used = bytes_used;
3016         found->disk_used = bytes_used * factor;
3017         found->bytes_pinned = 0;
3018         found->bytes_reserved = 0;
3019         found->bytes_readonly = 0;
3020         found->bytes_may_use = 0;
3021         found->full = 0;
3022         found->force_alloc = 0;
3023         *space_info = found;
3024         list_add_rcu(&found->list, &info->space_info);
3025         atomic_set(&found->caching_threads, 0);
3026         return 0;
3027 }
3028
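/*
 * Remember which RAID profile bits (RAID0/1/10/DUP) are in use for data,
 * metadata and system chunks; get_alloc_profile() folds them back into later
 * allocation requests.
 */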
3029 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3030 {
3031         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
3032                                    BTRFS_BLOCK_GROUP_RAID1 |
3033                                    BTRFS_BLOCK_GROUP_RAID10 |
3034                                    BTRFS_BLOCK_GROUP_DUP);
3035         if (extra_flags) {
3036                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3037                         fs_info->avail_data_alloc_bits |= extra_flags;
3038                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3039                         fs_info->avail_metadata_alloc_bits |= extra_flags;
3040                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3041                         fs_info->avail_system_alloc_bits |= extra_flags;
3042         }
3043 }
3044
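/*
 * Drop any RAID profile bits that the currently available devices cannot
 * satisfy and, when several profiles are requested at once, reduce the set
 * to a single choice.
 */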
3045 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3046 {
3047         /*
3048          * we add in the count of missing devices because we want
3049          * to make sure that any RAID levels on a degraded FS
3050          * continue to be honored.
3051          */
3052         u64 num_devices = root->fs_info->fs_devices->rw_devices +
3053                 root->fs_info->fs_devices->missing_devices;
3054
3055         if (num_devices == 1)
3056                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3057         if (num_devices < 4)
3058                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3059
3060         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3061             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3062                       BTRFS_BLOCK_GROUP_RAID10))) {
3063                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
3064         }
3065
3066         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3067             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3068                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3069         }
3070
3071         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3072             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3073              (flags & BTRFS_BLOCK_GROUP_RAID10) |
3074              (flags & BTRFS_BLOCK_GROUP_DUP)))
3075                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3076         return flags;
3077 }
3078
3079 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3080 {
3081         if (flags & BTRFS_BLOCK_GROUP_DATA)
3082                 flags |= root->fs_info->avail_data_alloc_bits &
3083                          root->fs_info->data_alloc_profile;
3084         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3085                 flags |= root->fs_info->avail_system_alloc_bits &
3086                          root->fs_info->system_alloc_profile;
3087         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3088                 flags |= root->fs_info->avail_metadata_alloc_bits &
3089                          root->fs_info->metadata_alloc_profile;
3090         return btrfs_reduce_alloc_profile(root, flags);
3091 }
3092
3093 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3094 {
3095         u64 flags;
3096
3097         if (data)
3098                 flags = BTRFS_BLOCK_GROUP_DATA;
3099         else if (root == root->fs_info->chunk_root)
3100                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3101         else
3102                 flags = BTRFS_BLOCK_GROUP_METADATA;
3103
3104         return get_alloc_profile(root, flags);
3105 }
3106
3107 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3108 {
3109         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3110                                                        BTRFS_BLOCK_GROUP_DATA);
3111 }
3112
3113 /*
3114  * This will check the space info that the inode allocates from to make sure
3115  * we have enough space for the requested number of bytes.
3116  */
3117 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3118 {
3119         struct btrfs_space_info *data_sinfo;
3120         struct btrfs_root *root = BTRFS_I(inode)->root;
3121         u64 used;
3122         int ret = 0, committed = 0, alloc_chunk = 1;
3123
3124         /* make sure bytes are sectorsize aligned */
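        /* e.g. with a 4K sectorsize, a 6000 byte request rounds up to 8192 */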
3125         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3126
3127         if (root == root->fs_info->tree_root) {
3128                 alloc_chunk = 0;
3129                 committed = 1;
3130         }
3131
3132         data_sinfo = BTRFS_I(inode)->space_info;
3133         if (!data_sinfo)
3134                 goto alloc;
3135
3136 again:
3137         /* make sure we have enough space to handle the data first */
3138         spin_lock(&data_sinfo->lock);
3139         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3140                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3141                 data_sinfo->bytes_may_use;
3142
3143         if (used + bytes > data_sinfo->total_bytes) {
3144                 struct btrfs_trans_handle *trans;
3145
3146                 /*
3147                  * if we don't have enough free bytes in this space then we need
3148                  * to alloc a new chunk.
3149                  */
3150                 if (!data_sinfo->full && alloc_chunk) {
3151                         u64 alloc_target;
3152
3153                         data_sinfo->force_alloc = 1;
3154                         spin_unlock(&data_sinfo->lock);
3155 alloc:
3156                         alloc_target = btrfs_get_alloc_profile(root, 1);
3157                         trans = btrfs_join_transaction(root, 1);
3158                         if (IS_ERR(trans))
3159                                 return PTR_ERR(trans);
3160
3161                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3162                                              bytes + 2 * 1024 * 1024,
3163                                              alloc_target, 0);
3164                         btrfs_end_transaction(trans, root);
3165                         if (ret < 0) {
3166                                 if (ret != -ENOSPC)
3167                                         return ret;
3168                                 else
3169                                         goto commit_trans;
3170                         }
3171
3172                         if (!data_sinfo) {
3173                                 btrfs_set_inode_space_info(root, inode);
3174                                 data_sinfo = BTRFS_I(inode)->space_info;
3175                         }
3176                         goto again;
3177                 }
3178                 spin_unlock(&data_sinfo->lock);
3179
3180                 /* commit the current transaction and try again */
3181 commit_trans:
3182                 if (!committed && !root->fs_info->open_ioctl_trans) {
3183                         committed = 1;
3184                         trans = btrfs_join_transaction(root, 1);
3185                         if (IS_ERR(trans))
3186                                 return PTR_ERR(trans);
3187                         ret = btrfs_commit_transaction(trans, root);
3188                         if (ret)
3189                                 return ret;
3190                         goto again;
3191                 }
3192
3193 #if 0 /* I hope we never need this code again, just in case */
3194                 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3195                        "%llu bytes_reserved, " "%llu bytes_pinned, "
3196                        "%llu bytes_readonly, %llu may use %llu total\n",
3197                        (unsigned long long)bytes,
3198                        (unsigned long long)data_sinfo->bytes_used,
3199                        (unsigned long long)data_sinfo->bytes_reserved,
3200                        (unsigned long long)data_sinfo->bytes_pinned,
3201                        (unsigned long long)data_sinfo->bytes_readonly,
3202                        (unsigned long long)data_sinfo->bytes_may_use,
3203                        (unsigned long long)data_sinfo->total_bytes);
3204 #endif
3205                 return -ENOSPC;
3206         }
3207         data_sinfo->bytes_may_use += bytes;
3208         BTRFS_I(inode)->reserved_bytes += bytes;
3209         spin_unlock(&data_sinfo->lock);
3210
3211         return 0;
3212 }
3213
3214 /*
3215  * called when we are clearing a delalloc extent from the
3216  * inode's io_tree or there was an error for whatever reason
3217  * after calling btrfs_check_data_free_space
3218  */
3219 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3220 {
3221         struct btrfs_root *root = BTRFS_I(inode)->root;
3222         struct btrfs_space_info *data_sinfo;
3223
3224         /* make sure bytes are sectorsize aligned */
3225         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3226
3227         data_sinfo = BTRFS_I(inode)->space_info;
3228         spin_lock(&data_sinfo->lock);
3229         data_sinfo->bytes_may_use -= bytes;
3230         BTRFS_I(inode)->reserved_bytes -= bytes;
3231         spin_unlock(&data_sinfo->lock);
3232 }
3233
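/*
 * Mark every metadata space_info with force_alloc so the next call to
 * do_chunk_alloc() allocates a metadata chunk even if the heuristics would
 * normally skip it.
 */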
3234 static void force_metadata_allocation(struct btrfs_fs_info *info)
3235 {
3236         struct list_head *head = &info->space_info;
3237         struct btrfs_space_info *found;
3238
3239         rcu_read_lock();
3240         list_for_each_entry_rcu(found, head, list) {
3241                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3242                         found->force_alloc = 1;
3243         }
3244         rcu_read_unlock();
3245 }
3246
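/*
 * Heuristic used by do_chunk_alloc(): only allocate a new chunk once the
 * existing space is getting reasonably full, so small allocations don't
 * spread out over lots of nearly empty chunks.
 */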
3247 static int should_alloc_chunk(struct btrfs_root *root,
3248                               struct btrfs_space_info *sinfo, u64 alloc_bytes)
3249 {
3250         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3251         u64 thresh;
3252
3253         if (sinfo->bytes_used + sinfo->bytes_reserved +
3254             alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3255                 return 0;
3256
3257         if (sinfo->bytes_used + sinfo->bytes_reserved +
3258             alloc_bytes < div_factor(num_bytes, 8))
3259                 return 0;
3260
3261         thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3262         thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3263
3264         if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3265                 return 0;
3266
3267         return 1;
3268 }
3269
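/*
 * Allocate a new chunk of the given type if @force is set or the heuristics
 * say we should.  Returns 1 if a chunk was allocated, 0 if allocation was
 * skipped, and a negative error (after marking the space_info full) if the
 * chunk allocation itself failed.
 */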
3270 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3271                           struct btrfs_root *extent_root, u64 alloc_bytes,
3272                           u64 flags, int force)
3273 {
3274         struct btrfs_space_info *space_info;
3275         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3276         int ret = 0;
3277
3278         mutex_lock(&fs_info->chunk_mutex);
3279
3280         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3281
3282         space_info = __find_space_info(extent_root->fs_info, flags);
3283         if (!space_info) {
3284                 ret = update_space_info(extent_root->fs_info, flags,
3285                                         0, 0, &space_info);
3286                 BUG_ON(ret);
3287         }
3288         BUG_ON(!space_info);
3289
3290         spin_lock(&space_info->lock);
3291         if (space_info->force_alloc)
3292                 force = 1;
3293         if (space_info->full) {
3294                 spin_unlock(&space_info->lock);
3295                 goto out;
3296         }
3297
3298         if (!force && !should_alloc_chunk(extent_root, space_info,
3299                                           alloc_bytes)) {
3300                 spin_unlock(&space_info->lock);
3301                 goto out;
3302         }
3303         spin_unlock(&space_info->lock);
3304
3305         /*
3306          * If we have mixed data/metadata chunks we want to make sure we keep
3307          * allocating mixed chunks instead of individual chunks.
3308          */
3309         if (btrfs_mixed_space_info(space_info))
3310                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3311
3312         /*
3313          * if we're doing a data chunk, go ahead and make sure that
3314          * we keep a reasonable number of metadata chunks allocated in the
3315          * FS as well.
3316          */
3317         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3318                 fs_info->data_chunk_allocations++;
3319                 if (!(fs_info->data_chunk_allocations %
3320                       fs_info->metadata_ratio))
3321                         force_metadata_allocation(fs_info);
3322         }
3323
3324         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3325         spin_lock(&space_info->lock);
3326         if (ret)
3327                 space_info->full = 1;
3328         else
3329                 ret = 1;
3330         space_info->force_alloc = 0;
3331         spin_unlock(&space_info->lock);
3332 out:
3333         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3334         return ret;
3335 }
3336
3337 /*
3338  * shrink metadata reservation for delalloc
3339  */
3340 static int shrink_delalloc(struct btrfs_trans_handle *trans,
3341                            struct btrfs_root *root, u64 to_reclaim, int sync)
3342 {
3343         struct btrfs_block_rsv *block_rsv;
3344         struct btrfs_space_info *space_info;
3345         u64 reserved;
3346         u64 max_reclaim;
3347         u64 reclaimed = 0;
3348         long time_left;
3349         int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3350         int loops = 0;
3351         unsigned long progress;
3352
3353         block_rsv = &root->fs_info->delalloc_block_rsv;
3354         space_info = block_rsv->space_info;
3355
3356         smp_mb();
3357         reserved = space_info->bytes_reserved;
3358         progress = space_info->reservation_progress;
3359
3360         if (reserved == 0)
3361                 return 0;
3362
3363         max_reclaim = min(reserved, to_reclaim);
3364
3365         while (loops < 1024) {
3366                 /* have the flusher threads jump in and do some IO */
3367                 smp_mb();
3368                 nr_pages = min_t(unsigned long, nr_pages,
3369                        root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3370                 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3371
3372                 spin_lock(&space_info->lock);
3373                 if (reserved > space_info->bytes_reserved)
3374                         reclaimed += reserved - space_info->bytes_reserved;
3375                 reserved = space_info->bytes_reserved;
3376                 spin_unlock(&space_info->lock);
3377
3378                 loops++;
3379
3380                 if (reserved == 0 || reclaimed >= max_reclaim)
3381                         break;
3382
3383                 if (trans && trans->transaction->blocked)
3384                         return -EAGAIN;
3385
3386                 time_left = schedule_timeout_interruptible(1);
3387
3388                 /* We were interrupted, exit */
3389                 if (time_left)
3390                         break;
3391
3392                 /* we've kicked the IO a few times, if anything has been freed,
3393                  * exit.  There is no sense in looping here for a long time
3394                  * when we really need to commit the transaction, or there are
3395                  * just too many writers without enough free space
3396                  */
3397
3398                 if (loops > 3) {
3399                         smp_mb();
3400                         if (progress != space_info->reservation_progress)
3401                                 break;
3402                 }
3403
3404         }
3405         return reclaimed >= to_reclaim;
3406 }
3407
3408 /*
3409  * Retries tells us how many times we've called reserve_metadata_bytes.  The
3410  * idea is if this is the first call (retries == 0) then we will add to our
3411  * reserved count if we can't make the allocation in order to hold our place
3412  * while we go and try and free up space.  That way for retries > 1 we don't try
3413  * and add space, we just check to see if the amount of unused space is >= the
3414  * total space, meaning that our reservation is valid.
3415  *
3416  * However if we don't intend to retry this reservation, pass -1 as retries so
3417  * that it short circuits this logic.
3418  */
3419 static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3420                                   struct btrfs_root *root,
3421                                   struct btrfs_block_rsv *block_rsv,
3422                                   u64 orig_bytes, int flush)
3423 {
3424         struct btrfs_space_info *space_info = block_rsv->space_info;
3425         u64 unused;
3426         u64 num_bytes = orig_bytes;
3427         int retries = 0;
3428         int ret = 0;
3429         bool reserved = false;
3430         bool committed = false;
3431
3432 again:
3433         ret = -ENOSPC;
3434         if (reserved)
3435                 num_bytes = 0;
3436
3437         spin_lock(&space_info->lock);
3438         unused = space_info->bytes_used + space_info->bytes_reserved +
3439                  space_info->bytes_pinned + space_info->bytes_readonly +
3440                  space_info->bytes_may_use;
3441
3442         /*
3443          * The idea here is that if we've not already over-reserved the block
3444          * group then we can go ahead and save our reservation first and then
3445          * start flushing if we need to.  Otherwise, if we've already
3446          * overcommitted, let's start flushing stuff first and then come back
3447          * and try to make our reservation.
3448          */
3449         if (unused <= space_info->total_bytes) {
3450                 unused = space_info->total_bytes - unused;
3451                 if (unused >= num_bytes) {
3452                         if (!reserved)
3453                                 space_info->bytes_reserved += orig_bytes;
3454                         ret = 0;
3455                 } else {
3456                         /*
3457                          * Ok set num_bytes to orig_bytes since we aren't
3458                          * overcommitted, this way we only try and reclaim what
3459                          * we need.
3460                          */
3461                         num_bytes = orig_bytes;
3462                 }
3463         } else {
3464                 /*
3465                  * Ok we're over committed, set num_bytes to the overcommitted
3466                  * amount plus the amount of bytes that we need for this
3467                  * reservation.
3468                  */
3469                 num_bytes = unused - space_info->total_bytes +
3470                         (orig_bytes * (retries + 1));
3471         }
3472
3473         /*
3474          * Couldn't make our reservation, save our place so while we're trying
3475          * to reclaim space we can actually use it instead of somebody else
3476          * stealing it from us.
3477          */
3478         if (ret && !reserved) {
3479                 space_info->bytes_reserved += orig_bytes;
3480                 reserved = true;
3481         }
3482
3483         spin_unlock(&space_info->lock);
3484
3485         if (!ret)
3486                 return 0;
3487
3488         if (!flush)
3489                 goto out;