[CPUFREQ] Remove cpufreq_stats sysfs entries on module unload.
[pandora-kernel.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "locking.h"
26
27 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
28                       *root, struct btrfs_path *path, int level);
29 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_key *ins_key,
31                       struct btrfs_path *path, int data_size, int extend);
32 static int push_node_left(struct btrfs_trans_handle *trans,
33                           struct btrfs_root *root, struct extent_buffer *dst,
34                           struct extent_buffer *src, int empty);
35 static int balance_node_right(struct btrfs_trans_handle *trans,
36                               struct btrfs_root *root,
37                               struct extent_buffer *dst_buf,
38                               struct extent_buffer *src_buf);
39 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40                    struct btrfs_path *path, int level, int slot);
41
42 struct btrfs_path *btrfs_alloc_path(void)
43 {
44         struct btrfs_path *path;
45         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
46         return path;
47 }
48
49 /*
50  * set all locked nodes in the path to blocking locks.  This should
51  * be done before scheduling
52  */
53 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
54 {
55         int i;
56         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57                 if (p->nodes[i] && p->locks[i])
58                         btrfs_set_lock_blocking(p->nodes[i]);
59         }
60 }
61
62 /*
63  * reset all the locked nodes in the patch to spinning locks.
64  *
65  * held is used to keep lockdep happy, when lockdep is enabled
66  * we set held to a blocking lock before we go around and
67  * retake all the spinlocks in the path.  You can safely use NULL
68  * for held
69  */
70 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
71                                         struct extent_buffer *held)
72 {
73         int i;
74
75 #ifdef CONFIG_DEBUG_LOCK_ALLOC
76         /* lockdep really cares that we take all of these spinlocks
77          * in the right order.  If any of the locks in the path are not
78          * currently blocking, it is going to complain.  So, make really
79          * really sure by forcing the path to blocking before we clear
80          * the path blocking.
81          */
82         if (held)
83                 btrfs_set_lock_blocking(held);
84         btrfs_set_path_blocking(p);
85 #endif
86
87         for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
88                 if (p->nodes[i] && p->locks[i])
89                         btrfs_clear_lock_blocking(p->nodes[i]);
90         }
91
92 #ifdef CONFIG_DEBUG_LOCK_ALLOC
93         if (held)
94                 btrfs_clear_lock_blocking(held);
95 #endif
96 }
97
98 /* this also releases the path */
99 void btrfs_free_path(struct btrfs_path *p)
100 {
101         if (!p)
102                 return;
103         btrfs_release_path(p);
104         kmem_cache_free(btrfs_path_cachep, p);
105 }
106
107 /*
108  * path release drops references on the extent buffers in the path
109  * and it drops any locks held by this path
110  *
111  * It is safe to call this on paths that no locks or extent buffers held.
112  */
113 noinline void btrfs_release_path(struct btrfs_path *p)
114 {
115         int i;
116
117         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
118                 p->slots[i] = 0;
119                 if (!p->nodes[i])
120                         continue;
121                 if (p->locks[i]) {
122                         btrfs_tree_unlock(p->nodes[i]);
123                         p->locks[i] = 0;
124                 }
125                 free_extent_buffer(p->nodes[i]);
126                 p->nodes[i] = NULL;
127         }
128 }
129
130 /*
131  * safely gets a reference on the root node of a tree.  A lock
132  * is not taken, so a concurrent writer may put a different node
133  * at the root of the tree.  See btrfs_lock_root_node for the
134  * looping required.
135  *
136  * The extent buffer returned by this has a reference taken, so
137  * it won't disappear.  It may stop being the root of the tree
138  * at any time because there are no locks held.
139  */
140 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
141 {
142         struct extent_buffer *eb;
143
144         rcu_read_lock();
145         eb = rcu_dereference(root->node);
146         extent_buffer_get(eb);
147         rcu_read_unlock();
148         return eb;
149 }
150
151 /* loop around taking references on and locking the root node of the
152  * tree until you end up with a lock on the root.  A locked buffer
153  * is returned, with a reference held.
154  */
155 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
156 {
157         struct extent_buffer *eb;
158
159         while (1) {
160                 eb = btrfs_root_node(root);
161                 btrfs_tree_lock(eb);
162                 if (eb == root->node)
163                         break;
164                 btrfs_tree_unlock(eb);
165                 free_extent_buffer(eb);
166         }
167         return eb;
168 }
169
170 /* cowonly root (everything not a reference counted cow subvolume), just get
171  * put onto a simple dirty list.  transaction.c walks this to make sure they
172  * get properly updated on disk.
173  */
174 static void add_root_to_dirty_list(struct btrfs_root *root)
175 {
176         if (root->track_dirty && list_empty(&root->dirty_list)) {
177                 list_add(&root->dirty_list,
178                          &root->fs_info->dirty_cowonly_roots);
179         }
180 }
181
182 /*
183  * used by snapshot creation to make a copy of a root for a tree with
184  * a given objectid.  The buffer with the new root node is returned in
185  * cow_ret, and this func returns zero on success or a negative error code.
186  */
187 int btrfs_copy_root(struct btrfs_trans_handle *trans,
188                       struct btrfs_root *root,
189                       struct extent_buffer *buf,
190                       struct extent_buffer **cow_ret, u64 new_root_objectid)
191 {
192         struct extent_buffer *cow;
193         int ret = 0;
194         int level;
195         struct btrfs_disk_key disk_key;
196
197         WARN_ON(root->ref_cows && trans->transid !=
198                 root->fs_info->running_transaction->transid);
199         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
200
201         level = btrfs_header_level(buf);
202         if (level == 0)
203                 btrfs_item_key(buf, &disk_key, 0);
204         else
205                 btrfs_node_key(buf, &disk_key, 0);
206
207         cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
208                                      new_root_objectid, &disk_key, level,
209                                      buf->start, 0);
210         if (IS_ERR(cow))
211                 return PTR_ERR(cow);
212
213         copy_extent_buffer(cow, buf, 0, 0, cow->len);
214         btrfs_set_header_bytenr(cow, cow->start);
215         btrfs_set_header_generation(cow, trans->transid);
216         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
217         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
218                                      BTRFS_HEADER_FLAG_RELOC);
219         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
220                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
221         else
222                 btrfs_set_header_owner(cow, new_root_objectid);
223
224         write_extent_buffer(cow, root->fs_info->fsid,
225                             (unsigned long)btrfs_header_fsid(cow),
226                             BTRFS_FSID_SIZE);
227
228         WARN_ON(btrfs_header_generation(buf) > trans->transid);
229         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
230                 ret = btrfs_inc_ref(trans, root, cow, 1);
231         else
232                 ret = btrfs_inc_ref(trans, root, cow, 0);
233
234         if (ret)
235                 return ret;
236
237         btrfs_mark_buffer_dirty(cow);
238         *cow_ret = cow;
239         return 0;
240 }
241
242 /*
243  * check if the tree block can be shared by multiple trees
244  */
245 int btrfs_block_can_be_shared(struct btrfs_root *root,
246                               struct extent_buffer *buf)
247 {
248         /*
249          * Tree blocks not in refernece counted trees and tree roots
250          * are never shared. If a block was allocated after the last
251          * snapshot and the block was not allocated by tree relocation,
252          * we know the block is not shared.
253          */
254         if (root->ref_cows &&
255             buf != root->node && buf != root->commit_root &&
256             (btrfs_header_generation(buf) <=
257              btrfs_root_last_snapshot(&root->root_item) ||
258              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
259                 return 1;
260 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
261         if (root->ref_cows &&
262             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
263                 return 1;
264 #endif
265         return 0;
266 }
267
268 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
269                                        struct btrfs_root *root,
270                                        struct extent_buffer *buf,
271                                        struct extent_buffer *cow,
272                                        int *last_ref)
273 {
274         u64 refs;
275         u64 owner;
276         u64 flags;
277         u64 new_flags = 0;
278         int ret;
279
280         /*
281          * Backrefs update rules:
282          *
283          * Always use full backrefs for extent pointers in tree block
284          * allocated by tree relocation.
285          *
286          * If a shared tree block is no longer referenced by its owner
287          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
288          * use full backrefs for extent pointers in tree block.
289          *
290          * If a tree block is been relocating
291          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
292          * use full backrefs for extent pointers in tree block.
293          * The reason for this is some operations (such as drop tree)
294          * are only allowed for blocks use full backrefs.
295          */
296
297         if (btrfs_block_can_be_shared(root, buf)) {
298                 ret = btrfs_lookup_extent_info(trans, root, buf->start,
299                                                buf->len, &refs, &flags);
300                 BUG_ON(ret);
301                 BUG_ON(refs == 0);
302         } else {
303                 refs = 1;
304                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
305                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
306                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
307                 else
308                         flags = 0;
309         }
310
311         owner = btrfs_header_owner(buf);
312         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
313                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
314
315         if (refs > 1) {
316                 if ((owner == root->root_key.objectid ||
317                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
318                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
319                         ret = btrfs_inc_ref(trans, root, buf, 1);
320                         BUG_ON(ret);
321
322                         if (root->root_key.objectid ==
323                             BTRFS_TREE_RELOC_OBJECTID) {
324                                 ret = btrfs_dec_ref(trans, root, buf, 0);
325                                 BUG_ON(ret);
326                                 ret = btrfs_inc_ref(trans, root, cow, 1);
327                                 BUG_ON(ret);
328                         }
329                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
330                 } else {
331
332                         if (root->root_key.objectid ==
333                             BTRFS_TREE_RELOC_OBJECTID)
334                                 ret = btrfs_inc_ref(trans, root, cow, 1);
335                         else
336                                 ret = btrfs_inc_ref(trans, root, cow, 0);
337                         BUG_ON(ret);
338                 }
339                 if (new_flags != 0) {
340                         ret = btrfs_set_disk_extent_flags(trans, root,
341                                                           buf->start,
342                                                           buf->len,
343                                                           new_flags, 0);
344                         BUG_ON(ret);
345                 }
346         } else {
347                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
348                         if (root->root_key.objectid ==
349                             BTRFS_TREE_RELOC_OBJECTID)
350                                 ret = btrfs_inc_ref(trans, root, cow, 1);
351                         else
352                                 ret = btrfs_inc_ref(trans, root, cow, 0);
353                         BUG_ON(ret);
354                         ret = btrfs_dec_ref(trans, root, buf, 1);
355                         BUG_ON(ret);
356                 }
357                 clean_tree_block(trans, root, buf);
358                 *last_ref = 1;
359         }
360         return 0;
361 }
362
363 /*
364  * does the dirty work in cow of a single block.  The parent block (if
365  * supplied) is updated to point to the new cow copy.  The new buffer is marked
366  * dirty and returned locked.  If you modify the block it needs to be marked
367  * dirty again.
368  *
369  * search_start -- an allocation hint for the new block
370  *
371  * empty_size -- a hint that you plan on doing more cow.  This is the size in
372  * bytes the allocator should try to find free next to the block it returns.
373  * This is just a hint and may be ignored by the allocator.
374  */
375 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
376                              struct btrfs_root *root,
377                              struct extent_buffer *buf,
378                              struct extent_buffer *parent, int parent_slot,
379                              struct extent_buffer **cow_ret,
380                              u64 search_start, u64 empty_size)
381 {
382         struct btrfs_disk_key disk_key;
383         struct extent_buffer *cow;
384         int level;
385         int last_ref = 0;
386         int unlock_orig = 0;
387         u64 parent_start;
388
389         if (*cow_ret == buf)
390                 unlock_orig = 1;
391
392         btrfs_assert_tree_locked(buf);
393
394         WARN_ON(root->ref_cows && trans->transid !=
395                 root->fs_info->running_transaction->transid);
396         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
397
398         level = btrfs_header_level(buf);
399
400         if (level == 0)
401                 btrfs_item_key(buf, &disk_key, 0);
402         else
403                 btrfs_node_key(buf, &disk_key, 0);
404
405         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
406                 if (parent)
407                         parent_start = parent->start;
408                 else
409                         parent_start = 0;
410         } else
411                 parent_start = 0;
412
413         cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
414                                      root->root_key.objectid, &disk_key,
415                                      level, search_start, empty_size);
416         if (IS_ERR(cow))
417                 return PTR_ERR(cow);
418
419         /* cow is set to blocking by btrfs_init_new_buffer */
420
421         copy_extent_buffer(cow, buf, 0, 0, cow->len);
422         btrfs_set_header_bytenr(cow, cow->start);
423         btrfs_set_header_generation(cow, trans->transid);
424         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
425         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
426                                      BTRFS_HEADER_FLAG_RELOC);
427         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
428                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
429         else
430                 btrfs_set_header_owner(cow, root->root_key.objectid);
431
432         write_extent_buffer(cow, root->fs_info->fsid,
433                             (unsigned long)btrfs_header_fsid(cow),
434                             BTRFS_FSID_SIZE);
435
436         update_ref_for_cow(trans, root, buf, cow, &last_ref);
437
438         if (root->ref_cows)
439                 btrfs_reloc_cow_block(trans, root, buf, cow);
440
441         if (buf == root->node) {
442                 WARN_ON(parent && parent != buf);
443                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
444                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
445                         parent_start = buf->start;
446                 else
447                         parent_start = 0;
448
449                 extent_buffer_get(cow);
450                 rcu_assign_pointer(root->node, cow);
451
452                 btrfs_free_tree_block(trans, root, buf, parent_start,
453                                       last_ref);
454                 free_extent_buffer(buf);
455                 add_root_to_dirty_list(root);
456         } else {
457                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
458                         parent_start = parent->start;
459                 else
460                         parent_start = 0;
461
462                 WARN_ON(trans->transid != btrfs_header_generation(parent));
463                 btrfs_set_node_blockptr(parent, parent_slot,
464                                         cow->start);
465                 btrfs_set_node_ptr_generation(parent, parent_slot,
466                                               trans->transid);
467                 btrfs_mark_buffer_dirty(parent);
468                 btrfs_free_tree_block(trans, root, buf, parent_start,
469                                       last_ref);
470         }
471         if (unlock_orig)
472                 btrfs_tree_unlock(buf);
473         free_extent_buffer(buf);
474         btrfs_mark_buffer_dirty(cow);
475         *cow_ret = cow;
476         return 0;
477 }
478
479 static inline int should_cow_block(struct btrfs_trans_handle *trans,
480                                    struct btrfs_root *root,
481                                    struct extent_buffer *buf)
482 {
483         if (btrfs_header_generation(buf) == trans->transid &&
484             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
485             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
486               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
487                 return 0;
488         return 1;
489 }
490
491 /*
492  * cows a single block, see __btrfs_cow_block for the real work.
493  * This version of it has extra checks so that a block isn't cow'd more than
494  * once per transaction, as long as it hasn't been written yet
495  */
496 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
497                     struct btrfs_root *root, struct extent_buffer *buf,
498                     struct extent_buffer *parent, int parent_slot,
499                     struct extent_buffer **cow_ret)
500 {
501         u64 search_start;
502         int ret;
503
504         if (trans->transaction != root->fs_info->running_transaction) {
505                 printk(KERN_CRIT "trans %llu running %llu\n",
506                        (unsigned long long)trans->transid,
507                        (unsigned long long)
508                        root->fs_info->running_transaction->transid);
509                 WARN_ON(1);
510         }
511         if (trans->transid != root->fs_info->generation) {
512                 printk(KERN_CRIT "trans %llu running %llu\n",
513                        (unsigned long long)trans->transid,
514                        (unsigned long long)root->fs_info->generation);
515                 WARN_ON(1);
516         }
517
518         if (!should_cow_block(trans, root, buf)) {
519                 *cow_ret = buf;
520                 return 0;
521         }
522
523         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
524
525         if (parent)
526                 btrfs_set_lock_blocking(parent);
527         btrfs_set_lock_blocking(buf);
528
529         ret = __btrfs_cow_block(trans, root, buf, parent,
530                                  parent_slot, cow_ret, search_start, 0);
531
532         trace_btrfs_cow_block(root, buf, *cow_ret);
533
534         return ret;
535 }
536
537 /*
538  * helper function for defrag to decide if two blocks pointed to by a
539  * node are actually close by
540  */
541 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
542 {
543         if (blocknr < other && other - (blocknr + blocksize) < 32768)
544                 return 1;
545         if (blocknr > other && blocknr - (other + blocksize) < 32768)
546                 return 1;
547         return 0;
548 }
549
550 /*
551  * compare two keys in a memcmp fashion
552  */
553 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
554 {
555         struct btrfs_key k1;
556
557         btrfs_disk_key_to_cpu(&k1, disk);
558
559         return btrfs_comp_cpu_keys(&k1, k2);
560 }
561
562 /*
563  * same as comp_keys only with two btrfs_key's
564  */
565 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
566 {
567         if (k1->objectid > k2->objectid)
568                 return 1;
569         if (k1->objectid < k2->objectid)
570                 return -1;
571         if (k1->type > k2->type)
572                 return 1;
573         if (k1->type < k2->type)
574                 return -1;
575         if (k1->offset > k2->offset)
576                 return 1;
577         if (k1->offset < k2->offset)
578                 return -1;
579         return 0;
580 }
581
582 /*
583  * this is used by the defrag code to go through all the
584  * leaves pointed to by a node and reallocate them so that
585  * disk order is close to key order
586  */
587 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
588                        struct btrfs_root *root, struct extent_buffer *parent,
589                        int start_slot, int cache_only, u64 *last_ret,
590                        struct btrfs_key *progress)
591 {
592         struct extent_buffer *cur;
593         u64 blocknr;
594         u64 gen;
595         u64 search_start = *last_ret;
596         u64 last_block = 0;
597         u64 other;
598         u32 parent_nritems;
599         int end_slot;
600         int i;
601         int err = 0;
602         int parent_level;
603         int uptodate;
604         u32 blocksize;
605         int progress_passed = 0;
606         struct btrfs_disk_key disk_key;
607
608         parent_level = btrfs_header_level(parent);
609         if (cache_only && parent_level != 1)
610                 return 0;
611
612         if (trans->transaction != root->fs_info->running_transaction)
613                 WARN_ON(1);
614         if (trans->transid != root->fs_info->generation)
615                 WARN_ON(1);
616
617         parent_nritems = btrfs_header_nritems(parent);
618         blocksize = btrfs_level_size(root, parent_level - 1);
619         end_slot = parent_nritems;
620
621         if (parent_nritems == 1)
622                 return 0;
623
624         btrfs_set_lock_blocking(parent);
625
626         for (i = start_slot; i < end_slot; i++) {
627                 int close = 1;
628
629                 if (!parent->map_token) {
630                         map_extent_buffer(parent,
631                                         btrfs_node_key_ptr_offset(i),
632                                         sizeof(struct btrfs_key_ptr),
633                                         &parent->map_token, &parent->kaddr,
634                                         &parent->map_start, &parent->map_len,
635                                         KM_USER1);
636                 }
637                 btrfs_node_key(parent, &disk_key, i);
638                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
639                         continue;
640
641                 progress_passed = 1;
642                 blocknr = btrfs_node_blockptr(parent, i);
643                 gen = btrfs_node_ptr_generation(parent, i);
644                 if (last_block == 0)
645                         last_block = blocknr;
646
647                 if (i > 0) {
648                         other = btrfs_node_blockptr(parent, i - 1);
649                         close = close_blocks(blocknr, other, blocksize);
650                 }
651                 if (!close && i < end_slot - 2) {
652                         other = btrfs_node_blockptr(parent, i + 1);
653                         close = close_blocks(blocknr, other, blocksize);
654                 }
655                 if (close) {
656                         last_block = blocknr;
657                         continue;
658                 }
659                 if (parent->map_token) {
660                         unmap_extent_buffer(parent, parent->map_token,
661                                             KM_USER1);
662                         parent->map_token = NULL;
663                 }
664
665                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
666                 if (cur)
667                         uptodate = btrfs_buffer_uptodate(cur, gen);
668                 else
669                         uptodate = 0;
670                 if (!cur || !uptodate) {
671                         if (cache_only) {
672                                 free_extent_buffer(cur);
673                                 continue;
674                         }
675                         if (!cur) {
676                                 cur = read_tree_block(root, blocknr,
677                                                          blocksize, gen);
678                                 if (!cur)
679                                         return -EIO;
680                         } else if (!uptodate) {
681                                 btrfs_read_buffer(cur, gen);
682                         }
683                 }
684                 if (search_start == 0)
685                         search_start = last_block;
686
687                 btrfs_tree_lock(cur);
688                 btrfs_set_lock_blocking(cur);
689                 err = __btrfs_cow_block(trans, root, cur, parent, i,
690                                         &cur, search_start,
691                                         min(16 * blocksize,
692                                             (end_slot - i) * blocksize));
693                 if (err) {
694                         btrfs_tree_unlock(cur);
695                         free_extent_buffer(cur);
696                         break;
697                 }
698                 search_start = cur->start;
699                 last_block = cur->start;
700                 *last_ret = search_start;
701                 btrfs_tree_unlock(cur);
702                 free_extent_buffer(cur);
703         }
704         if (parent->map_token) {
705                 unmap_extent_buffer(parent, parent->map_token,
706                                     KM_USER1);
707                 parent->map_token = NULL;
708         }
709         return err;
710 }
711
712 /*
713  * The leaf data grows from end-to-front in the node.
714  * this returns the address of the start of the last item,
715  * which is the stop of the leaf data stack
716  */
717 static inline unsigned int leaf_data_end(struct btrfs_root *root,
718                                          struct extent_buffer *leaf)
719 {
720         u32 nr = btrfs_header_nritems(leaf);
721         if (nr == 0)
722                 return BTRFS_LEAF_DATA_SIZE(root);
723         return btrfs_item_offset_nr(leaf, nr - 1);
724 }
725
726
727 /*
728  * search for key in the extent_buffer.  The items start at offset p,
729  * and they are item_size apart.  There are 'max' items in p.
730  *
731  * the slot in the array is returned via slot, and it points to
732  * the place where you would insert key if it is not found in
733  * the array.
734  *
735  * slot may point to max if the key is bigger than all of the keys
736  */
737 static noinline int generic_bin_search(struct extent_buffer *eb,
738                                        unsigned long p,
739                                        int item_size, struct btrfs_key *key,
740                                        int max, int *slot)
741 {
742         int low = 0;
743         int high = max;
744         int mid;
745         int ret;
746         struct btrfs_disk_key *tmp = NULL;
747         struct btrfs_disk_key unaligned;
748         unsigned long offset;
749         char *map_token = NULL;
750         char *kaddr = NULL;
751         unsigned long map_start = 0;
752         unsigned long map_len = 0;
753         int err;
754
755         while (low < high) {
756                 mid = (low + high) / 2;
757                 offset = p + mid * item_size;
758
759                 if (!map_token || offset < map_start ||
760                     (offset + sizeof(struct btrfs_disk_key)) >
761                     map_start + map_len) {
762                         if (map_token) {
763                                 unmap_extent_buffer(eb, map_token, KM_USER0);
764                                 map_token = NULL;
765                         }
766
767                         err = map_private_extent_buffer(eb, offset,
768                                                 sizeof(struct btrfs_disk_key),
769                                                 &map_token, &kaddr,
770                                                 &map_start, &map_len, KM_USER0);
771
772                         if (!err) {
773                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
774                                                         map_start);
775                         } else {
776                                 read_extent_buffer(eb, &unaligned,
777                                                    offset, sizeof(unaligned));
778                                 tmp = &unaligned;
779                         }
780
781                 } else {
782                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
783                                                         map_start);
784                 }
785                 ret = comp_keys(tmp, key);
786
787                 if (ret < 0)
788                         low = mid + 1;
789                 else if (ret > 0)
790                         high = mid;
791                 else {
792                         *slot = mid;
793                         if (map_token)
794                                 unmap_extent_buffer(eb, map_token, KM_USER0);
795                         return 0;
796                 }
797         }
798         *slot = low;
799         if (map_token)
800                 unmap_extent_buffer(eb, map_token, KM_USER0);
801         return 1;
802 }
803
804 /*
805  * simple bin_search frontend that does the right thing for
806  * leaves vs nodes
807  */
808 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
809                       int level, int *slot)
810 {
811         if (level == 0) {
812                 return generic_bin_search(eb,
813                                           offsetof(struct btrfs_leaf, items),
814                                           sizeof(struct btrfs_item),
815                                           key, btrfs_header_nritems(eb),
816                                           slot);
817         } else {
818                 return generic_bin_search(eb,
819                                           offsetof(struct btrfs_node, ptrs),
820                                           sizeof(struct btrfs_key_ptr),
821                                           key, btrfs_header_nritems(eb),
822                                           slot);
823         }
824         return -1;
825 }
826
827 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
828                      int level, int *slot)
829 {
830         return bin_search(eb, key, level, slot);
831 }
832
833 static void root_add_used(struct btrfs_root *root, u32 size)
834 {
835         spin_lock(&root->accounting_lock);
836         btrfs_set_root_used(&root->root_item,
837                             btrfs_root_used(&root->root_item) + size);
838         spin_unlock(&root->accounting_lock);
839 }
840
841 static void root_sub_used(struct btrfs_root *root, u32 size)
842 {
843         spin_lock(&root->accounting_lock);
844         btrfs_set_root_used(&root->root_item,
845                             btrfs_root_used(&root->root_item) - size);
846         spin_unlock(&root->accounting_lock);
847 }
848
849 /* given a node and slot number, this reads the blocks it points to.  The
850  * extent buffer is returned with a reference taken (but unlocked).
851  * NULL is returned on error.
852  */
853 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
854                                    struct extent_buffer *parent, int slot)
855 {
856         int level = btrfs_header_level(parent);
857         if (slot < 0)
858                 return NULL;
859         if (slot >= btrfs_header_nritems(parent))
860                 return NULL;
861
862         BUG_ON(level == 0);
863
864         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
865                        btrfs_level_size(root, level - 1),
866                        btrfs_node_ptr_generation(parent, slot));
867 }
868
869 /*
870  * node level balancing, used to make sure nodes are in proper order for
871  * item deletion.  We balance from the top down, so we have to make sure
872  * that a deletion won't leave an node completely empty later on.
873  */
874 static noinline int balance_level(struct btrfs_trans_handle *trans,
875                          struct btrfs_root *root,
876                          struct btrfs_path *path, int level)
877 {
878         struct extent_buffer *right = NULL;
879         struct extent_buffer *mid;
880         struct extent_buffer *left = NULL;
881         struct extent_buffer *parent = NULL;
882         int ret = 0;
883         int wret;
884         int pslot;
885         int orig_slot = path->slots[level];
886         u64 orig_ptr;
887
888         if (level == 0)
889                 return 0;
890
891         mid = path->nodes[level];
892
893         WARN_ON(!path->locks[level]);
894         WARN_ON(btrfs_header_generation(mid) != trans->transid);
895
896         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
897
898         if (level < BTRFS_MAX_LEVEL - 1)
899                 parent = path->nodes[level + 1];
900         pslot = path->slots[level + 1];
901
902         /*
903          * deal with the case where there is only one pointer in the root
904          * by promoting the node below to a root
905          */
906         if (!parent) {
907                 struct extent_buffer *child;
908
909                 if (btrfs_header_nritems(mid) != 1)
910                         return 0;
911
912                 /* promote the child to a root */
913                 child = read_node_slot(root, mid, 0);
914                 BUG_ON(!child);
915                 btrfs_tree_lock(child);
916                 btrfs_set_lock_blocking(child);
917                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
918                 if (ret) {
919                         btrfs_tree_unlock(child);
920                         free_extent_buffer(child);
921                         goto enospc;
922                 }
923
924                 rcu_assign_pointer(root->node, child);
925
926                 add_root_to_dirty_list(root);
927                 btrfs_tree_unlock(child);
928
929                 path->locks[level] = 0;
930                 path->nodes[level] = NULL;
931                 clean_tree_block(trans, root, mid);
932                 btrfs_tree_unlock(mid);
933                 /* once for the path */
934                 free_extent_buffer(mid);
935
936                 root_sub_used(root, mid->len);
937                 btrfs_free_tree_block(trans, root, mid, 0, 1);
938                 /* once for the root ptr */
939                 free_extent_buffer(mid);
940                 return 0;
941         }
942         if (btrfs_header_nritems(mid) >
943             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
944                 return 0;
945
946         btrfs_header_nritems(mid);
947
948         left = read_node_slot(root, parent, pslot - 1);
949         if (left) {
950                 btrfs_tree_lock(left);
951                 btrfs_set_lock_blocking(left);
952                 wret = btrfs_cow_block(trans, root, left,
953                                        parent, pslot - 1, &left);
954                 if (wret) {
955                         ret = wret;
956                         goto enospc;
957                 }
958         }
959         right = read_node_slot(root, parent, pslot + 1);
960         if (right) {
961                 btrfs_tree_lock(right);
962                 btrfs_set_lock_blocking(right);
963                 wret = btrfs_cow_block(trans, root, right,
964                                        parent, pslot + 1, &right);
965                 if (wret) {
966                         ret = wret;
967                         goto enospc;
968                 }
969         }
970
971         /* first, try to make some room in the middle buffer */
972         if (left) {
973                 orig_slot += btrfs_header_nritems(left);
974                 wret = push_node_left(trans, root, left, mid, 1);
975                 if (wret < 0)
976                         ret = wret;
977                 btrfs_header_nritems(mid);
978         }
979
980         /*
981          * then try to empty the right most buffer into the middle
982          */
983         if (right) {
984                 wret = push_node_left(trans, root, mid, right, 1);
985                 if (wret < 0 && wret != -ENOSPC)
986                         ret = wret;
987                 if (btrfs_header_nritems(right) == 0) {
988                         clean_tree_block(trans, root, right);
989                         btrfs_tree_unlock(right);
990                         wret = del_ptr(trans, root, path, level + 1, pslot +
991                                        1);
992                         if (wret)
993                                 ret = wret;
994                         root_sub_used(root, right->len);
995                         btrfs_free_tree_block(trans, root, right, 0, 1);
996                         free_extent_buffer(right);
997                         right = NULL;
998                 } else {
999                         struct btrfs_disk_key right_key;
1000                         btrfs_node_key(right, &right_key, 0);
1001                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1002                         btrfs_mark_buffer_dirty(parent);
1003                 }
1004         }
1005         if (btrfs_header_nritems(mid) == 1) {
1006                 /*
1007                  * we're not allowed to leave a node with one item in the
1008                  * tree during a delete.  A deletion from lower in the tree
1009                  * could try to delete the only pointer in this node.
1010                  * So, pull some keys from the left.
1011                  * There has to be a left pointer at this point because
1012                  * otherwise we would have pulled some pointers from the
1013                  * right
1014                  */
1015                 BUG_ON(!left);
1016                 wret = balance_node_right(trans, root, mid, left);
1017                 if (wret < 0) {
1018                         ret = wret;
1019                         goto enospc;
1020                 }
1021                 if (wret == 1) {
1022                         wret = push_node_left(trans, root, left, mid, 1);
1023                         if (wret < 0)
1024                                 ret = wret;
1025                 }
1026                 BUG_ON(wret == 1);
1027         }
1028         if (btrfs_header_nritems(mid) == 0) {
1029                 clean_tree_block(trans, root, mid);
1030                 btrfs_tree_unlock(mid);
1031                 wret = del_ptr(trans, root, path, level + 1, pslot);
1032                 if (wret)
1033                         ret = wret;
1034                 root_sub_used(root, mid->len);
1035                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1036                 free_extent_buffer(mid);
1037                 mid = NULL;
1038         } else {
1039                 /* update the parent key to reflect our changes */
1040                 struct btrfs_disk_key mid_key;
1041                 btrfs_node_key(mid, &mid_key, 0);
1042                 btrfs_set_node_key(parent, &mid_key, pslot);
1043                 btrfs_mark_buffer_dirty(parent);
1044         }
1045
1046         /* update the path */
1047         if (left) {
1048                 if (btrfs_header_nritems(left) > orig_slot) {
1049                         extent_buffer_get(left);
1050                         /* left was locked after cow */
1051                         path->nodes[level] = left;
1052                         path->slots[level + 1] -= 1;
1053                         path->slots[level] = orig_slot;
1054                         if (mid) {
1055                                 btrfs_tree_unlock(mid);
1056                                 free_extent_buffer(mid);
1057                         }
1058                 } else {
1059                         orig_slot -= btrfs_header_nritems(left);
1060                         path->slots[level] = orig_slot;
1061                 }
1062         }
1063         /* double check we haven't messed things up */
1064         if (orig_ptr !=
1065             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1066                 BUG();
1067 enospc:
1068         if (right) {
1069                 btrfs_tree_unlock(right);
1070                 free_extent_buffer(right);
1071         }
1072         if (left) {
1073                 if (path->nodes[level] != left)
1074                         btrfs_tree_unlock(left);
1075                 free_extent_buffer(left);
1076         }
1077         return ret;
1078 }
1079
1080 /* Node balancing for insertion.  Here we only split or push nodes around
1081  * when they are completely full.  This is also done top down, so we
1082  * have to be pessimistic.
1083  */
1084 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1085                                           struct btrfs_root *root,
1086                                           struct btrfs_path *path, int level)
1087 {
1088         struct extent_buffer *right = NULL;
1089         struct extent_buffer *mid;
1090         struct extent_buffer *left = NULL;
1091         struct extent_buffer *parent = NULL;
1092         int ret = 0;
1093         int wret;
1094         int pslot;
1095         int orig_slot = path->slots[level];
1096
1097         if (level == 0)
1098                 return 1;
1099
1100         mid = path->nodes[level];
1101         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1102
1103         if (level < BTRFS_MAX_LEVEL - 1)
1104                 parent = path->nodes[level + 1];
1105         pslot = path->slots[level + 1];
1106
1107         if (!parent)
1108                 return 1;
1109
1110         left = read_node_slot(root, parent, pslot - 1);
1111
1112         /* first, try to make some room in the middle buffer */
1113         if (left) {
1114                 u32 left_nr;
1115
1116                 btrfs_tree_lock(left);
1117                 btrfs_set_lock_blocking(left);
1118
1119                 left_nr = btrfs_header_nritems(left);
1120                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1121                         wret = 1;
1122                 } else {
1123                         ret = btrfs_cow_block(trans, root, left, parent,
1124                                               pslot - 1, &left);
1125                         if (ret)
1126                                 wret = 1;
1127                         else {
1128                                 wret = push_node_left(trans, root,
1129                                                       left, mid, 0);
1130                         }
1131                 }
1132                 if (wret < 0)
1133                         ret = wret;
1134                 if (wret == 0) {
1135                         struct btrfs_disk_key disk_key;
1136                         orig_slot += left_nr;
1137                         btrfs_node_key(mid, &disk_key, 0);
1138                         btrfs_set_node_key(parent, &disk_key, pslot);
1139                         btrfs_mark_buffer_dirty(parent);
1140                         if (btrfs_header_nritems(left) > orig_slot) {
1141                                 path->nodes[level] = left;
1142                                 path->slots[level + 1] -= 1;
1143                                 path->slots[level] = orig_slot;
1144                                 btrfs_tree_unlock(mid);
1145                                 free_extent_buffer(mid);
1146                         } else {
1147                                 orig_slot -=
1148                                         btrfs_header_nritems(left);
1149                                 path->slots[level] = orig_slot;
1150                                 btrfs_tree_unlock(left);
1151                                 free_extent_buffer(left);
1152                         }
1153                         return 0;
1154                 }
1155                 btrfs_tree_unlock(left);
1156                 free_extent_buffer(left);
1157         }
1158         right = read_node_slot(root, parent, pslot + 1);
1159
1160         /*
1161          * then try to empty the right most buffer into the middle
1162          */
1163         if (right) {
1164                 u32 right_nr;
1165
1166                 btrfs_tree_lock(right);
1167                 btrfs_set_lock_blocking(right);
1168
1169                 right_nr = btrfs_header_nritems(right);
1170                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1171                         wret = 1;
1172                 } else {
1173                         ret = btrfs_cow_block(trans, root, right,
1174                                               parent, pslot + 1,
1175                                               &right);
1176                         if (ret)
1177                                 wret = 1;
1178                         else {
1179                                 wret = balance_node_right(trans, root,
1180                                                           right, mid);
1181                         }
1182                 }
1183                 if (wret < 0)
1184                         ret = wret;
1185                 if (wret == 0) {
1186                         struct btrfs_disk_key disk_key;
1187
1188                         btrfs_node_key(right, &disk_key, 0);
1189                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1190                         btrfs_mark_buffer_dirty(parent);
1191
1192                         if (btrfs_header_nritems(mid) <= orig_slot) {
1193                                 path->nodes[level] = right;
1194                                 path->slots[level + 1] += 1;
1195                                 path->slots[level] = orig_slot -
1196                                         btrfs_header_nritems(mid);
1197                                 btrfs_tree_unlock(mid);
1198                                 free_extent_buffer(mid);
1199                         } else {
1200                                 btrfs_tree_unlock(right);
1201                                 free_extent_buffer(right);
1202                         }
1203                         return 0;
1204                 }
1205                 btrfs_tree_unlock(right);
1206                 free_extent_buffer(right);
1207         }
1208         return 1;
1209 }
1210
1211 /*
1212  * readahead one full node of leaves, finding things that are close
1213  * to the block in 'slot', and triggering ra on them.
1214  */
1215 static void reada_for_search(struct btrfs_root *root,
1216                              struct btrfs_path *path,
1217                              int level, int slot, u64 objectid)
1218 {
1219         struct extent_buffer *node;
1220         struct btrfs_disk_key disk_key;
1221         u32 nritems;
1222         u64 search;
1223         u64 target;
1224         u64 nread = 0;
1225         u64 gen;
1226         int direction = path->reada;
1227         struct extent_buffer *eb;
1228         u32 nr;
1229         u32 blocksize;
1230         u32 nscan = 0;
1231
1232         if (level != 1)
1233                 return;
1234
1235         if (!path->nodes[level])
1236                 return;
1237
1238         node = path->nodes[level];
1239
1240         search = btrfs_node_blockptr(node, slot);
1241         blocksize = btrfs_level_size(root, level - 1);
1242         eb = btrfs_find_tree_block(root, search, blocksize);
1243         if (eb) {
1244                 free_extent_buffer(eb);
1245                 return;
1246         }
1247
1248         target = search;
1249
1250         nritems = btrfs_header_nritems(node);
1251         nr = slot;
1252         while (1) {
1253                 if (!node->map_token) {
1254                         unsigned long offset = btrfs_node_key_ptr_offset(nr);
1255                         map_private_extent_buffer(node, offset,
1256                                                   sizeof(struct btrfs_key_ptr),
1257                                                   &node->map_token,
1258                                                   &node->kaddr,
1259                                                   &node->map_start,
1260                                                   &node->map_len, KM_USER1);
1261                 }
1262                 if (direction < 0) {
1263                         if (nr == 0)
1264                                 break;
1265                         nr--;
1266                 } else if (direction > 0) {
1267                         nr++;
1268                         if (nr >= nritems)
1269                                 break;
1270                 }
1271                 if (path->reada < 0 && objectid) {
1272                         btrfs_node_key(node, &disk_key, nr);
1273                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1274                                 break;
1275                 }
1276                 search = btrfs_node_blockptr(node, nr);
1277                 if ((search <= target && target - search <= 65536) ||
1278                     (search > target && search - target <= 65536)) {
1279                         gen = btrfs_node_ptr_generation(node, nr);
1280                         if (node->map_token) {
1281                                 unmap_extent_buffer(node, node->map_token,
1282                                                     KM_USER1);
1283                                 node->map_token = NULL;
1284                         }
1285                         readahead_tree_block(root, search, blocksize, gen);
1286                         nread += blocksize;
1287                 }
1288                 nscan++;
1289                 if ((nread > 65536 || nscan > 32))
1290                         break;
1291         }
1292         if (node->map_token) {
1293                 unmap_extent_buffer(node, node->map_token, KM_USER1);
1294                 node->map_token = NULL;
1295         }
1296 }
1297
1298 /*
1299  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1300  * cache
1301  */
1302 static noinline int reada_for_balance(struct btrfs_root *root,
1303                                       struct btrfs_path *path, int level)
1304 {
1305         int slot;
1306         int nritems;
1307         struct extent_buffer *parent;
1308         struct extent_buffer *eb;
1309         u64 gen;
1310         u64 block1 = 0;
1311         u64 block2 = 0;
1312         int ret = 0;
1313         int blocksize;
1314
1315         parent = path->nodes[level + 1];
1316         if (!parent)
1317                 return 0;
1318
1319         nritems = btrfs_header_nritems(parent);
1320         slot = path->slots[level + 1];
1321         blocksize = btrfs_level_size(root, level);
1322
1323         if (slot > 0) {
1324                 block1 = btrfs_node_blockptr(parent, slot - 1);
1325                 gen = btrfs_node_ptr_generation(parent, slot - 1);
1326                 eb = btrfs_find_tree_block(root, block1, blocksize);
1327                 if (eb && btrfs_buffer_uptodate(eb, gen))
1328                         block1 = 0;
1329                 free_extent_buffer(eb);
1330         }
1331         if (slot + 1 < nritems) {
1332                 block2 = btrfs_node_blockptr(parent, slot + 1);
1333                 gen = btrfs_node_ptr_generation(parent, slot + 1);
1334                 eb = btrfs_find_tree_block(root, block2, blocksize);
1335                 if (eb && btrfs_buffer_uptodate(eb, gen))
1336                         block2 = 0;
1337                 free_extent_buffer(eb);
1338         }
1339         if (block1 || block2) {
1340                 ret = -EAGAIN;
1341
1342                 /* release the whole path */
1343                 btrfs_release_path(path);
1344
1345                 /* read the blocks */
1346                 if (block1)
1347                         readahead_tree_block(root, block1, blocksize, 0);
1348                 if (block2)
1349                         readahead_tree_block(root, block2, blocksize, 0);
1350
1351                 if (block1) {
1352                         eb = read_tree_block(root, block1, blocksize, 0);
1353                         free_extent_buffer(eb);
1354                 }
1355                 if (block2) {
1356                         eb = read_tree_block(root, block2, blocksize, 0);
1357                         free_extent_buffer(eb);
1358                 }
1359         }
1360         return ret;
1361 }
1362
1363
1364 /*
1365  * when we walk down the tree, it is usually safe to unlock the higher layers
1366  * in the tree.  The exceptions are when our path goes through slot 0, because
1367  * operations on the tree might require changing key pointers higher up in the
1368  * tree.
1369  *
1370  * callers might also have set path->keep_locks, which tells this code to keep
1371  * the lock if the path points to the last slot in the block.  This is part of
1372  * walking through the tree, and selecting the next slot in the higher block.
1373  *
1374  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1375  * if lowest_unlock is 1, level 0 won't be unlocked
1376  */
1377 static noinline void unlock_up(struct btrfs_path *path, int level,
1378                                int lowest_unlock)
1379 {
1380         int i;
1381         int skip_level = level;
1382         int no_skips = 0;
1383         struct extent_buffer *t;
1384
1385         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1386                 if (!path->nodes[i])
1387                         break;
1388                 if (!path->locks[i])
1389                         break;
1390                 if (!no_skips && path->slots[i] == 0) {
1391                         skip_level = i + 1;
1392                         continue;
1393                 }
1394                 if (!no_skips && path->keep_locks) {
1395                         u32 nritems;
1396                         t = path->nodes[i];
1397                         nritems = btrfs_header_nritems(t);
1398                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
1399                                 skip_level = i + 1;
1400                                 continue;
1401                         }
1402                 }
1403                 if (skip_level < i && i >= lowest_unlock)
1404                         no_skips = 1;
1405
1406                 t = path->nodes[i];
1407                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1408                         btrfs_tree_unlock(t);
1409                         path->locks[i] = 0;
1410                 }
1411         }
1412 }
1413
1414 /*
1415  * This releases any locks held in the path starting at level and
1416  * going all the way up to the root.
1417  *
1418  * btrfs_search_slot will keep the lock held on higher nodes in a few
1419  * corner cases, such as COW of the block at slot zero in the node.  This
1420  * ignores those rules, and it should only be called when there are no
1421  * more updates to be done higher up in the tree.
1422  */
1423 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1424 {
1425         int i;
1426
1427         if (path->keep_locks)
1428                 return;
1429
1430         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1431                 if (!path->nodes[i])
1432                         continue;
1433                 if (!path->locks[i])
1434                         continue;
1435                 btrfs_tree_unlock(path->nodes[i]);
1436                 path->locks[i] = 0;
1437         }
1438 }
1439
1440 /*
1441  * helper function for btrfs_search_slot.  The goal is to find a block
1442  * in cache without setting the path to blocking.  If we find the block
1443  * we return zero and the path is unchanged.
1444  *
1445  * If we can't find the block, we set the path blocking and do some
1446  * reada.  -EAGAIN is returned and the search must be repeated.
1447  */
1448 static int
1449 read_block_for_search(struct btrfs_trans_handle *trans,
1450                        struct btrfs_root *root, struct btrfs_path *p,
1451                        struct extent_buffer **eb_ret, int level, int slot,
1452                        struct btrfs_key *key)
1453 {
1454         u64 blocknr;
1455         u64 gen;
1456         u32 blocksize;
1457         struct extent_buffer *b = *eb_ret;
1458         struct extent_buffer *tmp;
1459         int ret;
1460
1461         blocknr = btrfs_node_blockptr(b, slot);
1462         gen = btrfs_node_ptr_generation(b, slot);
1463         blocksize = btrfs_level_size(root, level - 1);
1464
1465         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1466         if (tmp) {
1467                 if (btrfs_buffer_uptodate(tmp, 0)) {
1468                         if (btrfs_buffer_uptodate(tmp, gen)) {
1469                                 /*
1470                                  * we found an up to date block without
1471                                  * sleeping, return
1472                                  * right away
1473                                  */
1474                                 *eb_ret = tmp;
1475                                 return 0;
1476                         }
1477                         /* the pages were up to date, but we failed
1478                          * the generation number check.  Do a full
1479                          * read for the generation number that is correct.
1480                          * We must do this without dropping locks so
1481                          * we can trust our generation number
1482                          */
1483                         free_extent_buffer(tmp);
1484                         tmp = read_tree_block(root, blocknr, blocksize, gen);
1485                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1486                                 *eb_ret = tmp;
1487                                 return 0;
1488                         }
1489                         free_extent_buffer(tmp);
1490                         btrfs_release_path(p);
1491                         return -EIO;
1492                 }
1493         }
1494
1495         /*
1496          * reduce lock contention at high levels
1497          * of the btree by dropping locks before
1498          * we read.  Don't release the lock on the current
1499          * level because we need to walk this node to figure
1500          * out which blocks to read.
1501          */
1502         btrfs_unlock_up_safe(p, level + 1);
1503         btrfs_set_path_blocking(p);
1504
1505         free_extent_buffer(tmp);
1506         if (p->reada)
1507                 reada_for_search(root, p, level, slot, key->objectid);
1508
1509         btrfs_release_path(p);
1510
1511         ret = -EAGAIN;
1512         tmp = read_tree_block(root, blocknr, blocksize, 0);
1513         if (tmp) {
1514                 /*
1515                  * If the read above didn't mark this buffer up to date,
1516                  * it will never end up being up to date.  Set ret to EIO now
1517                  * and give up so that our caller doesn't loop forever
1518                  * on our EAGAINs.
1519                  */
1520                 if (!btrfs_buffer_uptodate(tmp, 0))
1521                         ret = -EIO;
1522                 free_extent_buffer(tmp);
1523         }
1524         return ret;
1525 }
1526
1527 /*
1528  * helper function for btrfs_search_slot.  This does all of the checks
1529  * for node-level blocks and does any balancing required based on
1530  * the ins_len.
1531  *
1532  * If no extra work was required, zero is returned.  If we had to
1533  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1534  * start over
1535  */
1536 static int
1537 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1538                        struct btrfs_root *root, struct btrfs_path *p,
1539                        struct extent_buffer *b, int level, int ins_len)
1540 {
1541         int ret;
1542         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1543             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1544                 int sret;
1545
1546                 sret = reada_for_balance(root, p, level);
1547                 if (sret)
1548                         goto again;
1549
1550                 btrfs_set_path_blocking(p);
1551                 sret = split_node(trans, root, p, level);
1552                 btrfs_clear_path_blocking(p, NULL);
1553
1554                 BUG_ON(sret > 0);
1555                 if (sret) {
1556                         ret = sret;
1557                         goto done;
1558                 }
1559                 b = p->nodes[level];
1560         } else if (ins_len < 0 && btrfs_header_nritems(b) <
1561                    BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1562                 int sret;
1563
1564                 sret = reada_for_balance(root, p, level);
1565                 if (sret)
1566                         goto again;
1567
1568                 btrfs_set_path_blocking(p);
1569                 sret = balance_level(trans, root, p, level);
1570                 btrfs_clear_path_blocking(p, NULL);
1571
1572                 if (sret) {
1573                         ret = sret;
1574                         goto done;
1575                 }
1576                 b = p->nodes[level];
1577                 if (!b) {
1578                         btrfs_release_path(p);
1579                         goto again;
1580                 }
1581                 BUG_ON(btrfs_header_nritems(b) == 1);
1582         }
1583         return 0;
1584
1585 again:
1586         ret = -EAGAIN;
1587 done:
1588         return ret;
1589 }
1590
1591 /*
1592  * look for key in the tree.  path is filled in with nodes along the way
1593  * if key is found, we return zero and you can find the item in the leaf
1594  * level of the path (level 0)
1595  *
1596  * If the key isn't found, the path points to the slot where it should
1597  * be inserted, and 1 is returned.  If there are other errors during the
1598  * search a negative error number is returned.
1599  *
1600  * if ins_len > 0, nodes and leaves will be split as we walk down the
1601  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
1602  * possible)
1603  */
1604 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1605                       *root, struct btrfs_key *key, struct btrfs_path *p, int
1606                       ins_len, int cow)
1607 {
1608         struct extent_buffer *b;
1609         int slot;
1610         int ret;
1611         int err;
1612         int level;
1613         int lowest_unlock = 1;
1614         u8 lowest_level = 0;
1615
1616         lowest_level = p->lowest_level;
1617         WARN_ON(lowest_level && ins_len > 0);
1618         WARN_ON(p->nodes[0] != NULL);
1619
1620         if (ins_len < 0)
1621                 lowest_unlock = 2;
1622
1623 again:
1624         if (p->search_commit_root) {
1625                 b = root->commit_root;
1626                 extent_buffer_get(b);
1627                 if (!p->skip_locking)
1628                         btrfs_tree_lock(b);
1629         } else {
1630                 if (p->skip_locking)
1631                         b = btrfs_root_node(root);
1632                 else
1633                         b = btrfs_lock_root_node(root);
1634         }
1635
1636         while (b) {
1637                 level = btrfs_header_level(b);
1638
1639                 /*
1640                  * setup the path here so we can release it under lock
1641                  * contention with the cow code
1642                  */
1643                 p->nodes[level] = b;
1644                 if (!p->skip_locking)
1645                         p->locks[level] = 1;
1646
1647                 if (cow) {
1648                         /*
1649                          * if we don't really need to cow this block
1650                          * then we don't want to set the path blocking,
1651                          * so we test it here
1652                          */
1653                         if (!should_cow_block(trans, root, b))
1654                                 goto cow_done;
1655
1656                         btrfs_set_path_blocking(p);
1657
1658                         err = btrfs_cow_block(trans, root, b,
1659                                               p->nodes[level + 1],
1660                                               p->slots[level + 1], &b);
1661                         if (err) {
1662                                 ret = err;
1663                                 goto done;
1664                         }
1665                 }
1666 cow_done:
1667                 BUG_ON(!cow && ins_len);
1668
1669                 p->nodes[level] = b;
1670                 if (!p->skip_locking)
1671                         p->locks[level] = 1;
1672
1673                 btrfs_clear_path_blocking(p, NULL);
1674
1675                 /*
1676                  * we have a lock on b and as long as we aren't changing
1677                  * the tree, there is no way to for the items in b to change.
1678                  * It is safe to drop the lock on our parent before we
1679                  * go through the expensive btree search on b.
1680                  *
1681                  * If cow is true, then we might be changing slot zero,
1682                  * which may require changing the parent.  So, we can't
1683                  * drop the lock until after we know which slot we're
1684                  * operating on.
1685                  */
1686                 if (!cow)
1687                         btrfs_unlock_up_safe(p, level + 1);
1688
1689                 ret = bin_search(b, key, level, &slot);
1690
1691                 if (level != 0) {
1692                         int dec = 0;
1693                         if (ret && slot > 0) {
1694                                 dec = 1;
1695                                 slot -= 1;
1696                         }
1697                         p->slots[level] = slot;
1698                         err = setup_nodes_for_search(trans, root, p, b, level,
1699                                                      ins_len);
1700                         if (err == -EAGAIN)
1701                                 goto again;
1702                         if (err) {
1703                                 ret = err;
1704                                 goto done;
1705                         }
1706                         b = p->nodes[level];
1707                         slot = p->slots[level];
1708
1709                         unlock_up(p, level, lowest_unlock);
1710
1711                         if (level == lowest_level) {
1712                                 if (dec)
1713                                         p->slots[level]++;
1714                                 goto done;
1715                         }
1716
1717                         err = read_block_for_search(trans, root, p,
1718                                                     &b, level, slot, key);
1719                         if (err == -EAGAIN)
1720                                 goto again;
1721                         if (err) {
1722                                 ret = err;
1723                                 goto done;
1724                         }
1725
1726                         if (!p->skip_locking) {
1727                                 btrfs_clear_path_blocking(p, NULL);
1728                                 err = btrfs_try_spin_lock(b);
1729
1730                                 if (!err) {
1731                                         btrfs_set_path_blocking(p);
1732                                         btrfs_tree_lock(b);
1733                                         btrfs_clear_path_blocking(p, b);
1734                                 }
1735                         }
1736                 } else {
1737                         p->slots[level] = slot;
1738                         if (ins_len > 0 &&
1739                             btrfs_leaf_free_space(root, b) < ins_len) {
1740                                 btrfs_set_path_blocking(p);
1741                                 err = split_leaf(trans, root, key,
1742                                                  p, ins_len, ret == 0);
1743                                 btrfs_clear_path_blocking(p, NULL);
1744
1745                                 BUG_ON(err > 0);
1746                                 if (err) {
1747                                         ret = err;
1748                                         goto done;
1749                                 }
1750                         }
1751                         if (!p->search_for_split)
1752                                 unlock_up(p, level, lowest_unlock);
1753                         goto done;
1754                 }
1755         }
1756         ret = 1;
1757 done:
1758         /*
1759          * we don't really know what they plan on doing with the path
1760          * from here on, so for now just mark it as blocking
1761          */
1762         if (!p->leave_spinning)
1763                 btrfs_set_path_blocking(p);
1764         if (ret < 0)
1765                 btrfs_release_path(p);
1766         return ret;
1767 }
1768
1769 /*
1770  * adjust the pointers going up the tree, starting at level
1771  * making sure the right key of each node is points to 'key'.
1772  * This is used after shifting pointers to the left, so it stops
1773  * fixing up pointers when a given leaf/node is not in slot 0 of the
1774  * higher levels
1775  *
1776  * If this fails to write a tree block, it returns -1, but continues
1777  * fixing up the blocks in ram so the tree is consistent.
1778  */
1779 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1780                           struct btrfs_root *root, struct btrfs_path *path,
1781                           struct btrfs_disk_key *key, int level)
1782 {
1783         int i;
1784         int ret = 0;
1785         struct extent_buffer *t;
1786
1787         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1788                 int tslot = path->slots[i];
1789                 if (!path->nodes[i])
1790                         break;
1791                 t = path->nodes[i];
1792                 btrfs_set_node_key(t, key, tslot);
1793                 btrfs_mark_buffer_dirty(path->nodes[i]);
1794                 if (tslot != 0)
1795                         break;
1796         }
1797         return ret;
1798 }
1799
1800 /*
1801  * update item key.
1802  *
1803  * This function isn't completely safe. It's the caller's responsibility
1804  * that the new key won't break the order
1805  */
1806 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1807                             struct btrfs_root *root, struct btrfs_path *path,
1808                             struct btrfs_key *new_key)
1809 {
1810         struct btrfs_disk_key disk_key;
1811         struct extent_buffer *eb;
1812         int slot;
1813
1814         eb = path->nodes[0];
1815         slot = path->slots[0];
1816         if (slot > 0) {
1817                 btrfs_item_key(eb, &disk_key, slot - 1);
1818                 if (comp_keys(&disk_key, new_key) >= 0)
1819                         return -1;
1820         }
1821         if (slot < btrfs_header_nritems(eb) - 1) {
1822                 btrfs_item_key(eb, &disk_key, slot + 1);
1823                 if (comp_keys(&disk_key, new_key) <= 0)
1824                         return -1;
1825         }
1826
1827         btrfs_cpu_key_to_disk(&disk_key, new_key);
1828         btrfs_set_item_key(eb, &disk_key, slot);
1829         btrfs_mark_buffer_dirty(eb);
1830         if (slot == 0)
1831                 fixup_low_keys(trans, root, path, &disk_key, 1);
1832         return 0;
1833 }
1834
1835 /*
1836  * try to push data from one node into the next node left in the
1837  * tree.
1838  *
1839  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1840  * error, and > 0 if there was no room in the left hand block.
1841  */
1842 static int push_node_left(struct btrfs_trans_handle *trans,
1843                           struct btrfs_root *root, struct extent_buffer *dst,
1844                           struct extent_buffer *src, int empty)
1845 {
1846         int push_items = 0;
1847         int src_nritems;
1848         int dst_nritems;
1849         int ret = 0;
1850
1851         src_nritems = btrfs_header_nritems(src);
1852         dst_nritems = btrfs_header_nritems(dst);
1853         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1854         WARN_ON(btrfs_header_generation(src) != trans->transid);
1855         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1856
1857         if (!empty && src_nritems <= 8)
1858                 return 1;
1859
1860         if (push_items <= 0)
1861                 return 1;
1862
1863         if (empty) {
1864                 push_items = min(src_nritems, push_items);
1865                 if (push_items < src_nritems) {
1866                         /* leave at least 8 pointers in the node if
1867                          * we aren't going to empty it
1868                          */
1869                         if (src_nritems - push_items < 8) {
1870                                 if (push_items <= 8)
1871                                         return 1;
1872                                 push_items -= 8;
1873                         }
1874                 }
1875         } else
1876                 push_items = min(src_nritems - 8, push_items);
1877
1878         copy_extent_buffer(dst, src,
1879                            btrfs_node_key_ptr_offset(dst_nritems),
1880                            btrfs_node_key_ptr_offset(0),
1881                            push_items * sizeof(struct btrfs_key_ptr));
1882
1883         if (push_items < src_nritems) {
1884                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1885                                       btrfs_node_key_ptr_offset(push_items),
1886                                       (src_nritems - push_items) *
1887                                       sizeof(struct btrfs_key_ptr));
1888         }
1889         btrfs_set_header_nritems(src, src_nritems - push_items);
1890         btrfs_set_header_nritems(dst, dst_nritems + push_items);
1891         btrfs_mark_buffer_dirty(src);
1892         btrfs_mark_buffer_dirty(dst);
1893
1894         return ret;
1895 }
1896
1897 /*
1898  * try to push data from one node into the next node right in the
1899  * tree.
1900  *
1901  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1902  * error, and > 0 if there was no room in the right hand block.
1903  *
1904  * this will  only push up to 1/2 the contents of the left node over
1905  */
1906 static int balance_node_right(struct btrfs_trans_handle *trans,
1907                               struct btrfs_root *root,
1908                               struct extent_buffer *dst,
1909                               struct extent_buffer *src)
1910 {
1911         int push_items = 0;
1912         int max_push;
1913         int src_nritems;
1914         int dst_nritems;
1915         int ret = 0;
1916
1917         WARN_ON(btrfs_header_generation(src) != trans->transid);
1918         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1919
1920         src_nritems = btrfs_header_nritems(src);
1921         dst_nritems = btrfs_header_nritems(dst);
1922         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1923         if (push_items <= 0)
1924                 return 1;
1925
1926         if (src_nritems < 4)
1927                 return 1;
1928
1929         max_push = src_nritems / 2 + 1;
1930         /* don't try to empty the node */
1931         if (max_push >= src_nritems)
1932                 return 1;
1933
1934         if (max_push < push_items)
1935                 push_items = max_push;
1936
1937         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1938                                       btrfs_node_key_ptr_offset(0),
1939                                       (dst_nritems) *
1940                                       sizeof(struct btrfs_key_ptr));
1941
1942         copy_extent_buffer(dst, src,
1943                            btrfs_node_key_ptr_offset(0),
1944                            btrfs_node_key_ptr_offset(src_nritems - push_items),
1945                            push_items * sizeof(struct btrfs_key_ptr));
1946
1947         btrfs_set_header_nritems(src, src_nritems - push_items);
1948         btrfs_set_header_nritems(dst, dst_nritems + push_items);
1949
1950         btrfs_mark_buffer_dirty(src);
1951         btrfs_mark_buffer_dirty(dst);
1952
1953         return ret;
1954 }
1955
1956 /*
1957  * helper function to insert a new root level in the tree.
1958  * A new node is allocated, and a single item is inserted to
1959  * point to the existing root
1960  *
1961  * returns zero on success or < 0 on failure.
1962  */
1963 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1964                            struct btrfs_root *root,
1965                            struct btrfs_path *path, int level)
1966 {
1967         u64 lower_gen;
1968         struct extent_buffer *lower;
1969         struct extent_buffer *c;
1970         struct extent_buffer *old;
1971         struct btrfs_disk_key lower_key;
1972
1973         BUG_ON(path->nodes[level]);
1974         BUG_ON(path->nodes[level-1] != root->node);
1975
1976         lower = path->nodes[level-1];
1977         if (level == 1)
1978                 btrfs_item_key(lower, &lower_key, 0);
1979         else
1980                 btrfs_node_key(lower, &lower_key, 0);
1981
1982         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1983                                    root->root_key.objectid, &lower_key,
1984                                    level, root->node->start, 0);
1985         if (IS_ERR(c))
1986                 return PTR_ERR(c);
1987
1988         root_add_used(root, root->nodesize);
1989
1990         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
1991         btrfs_set_header_nritems(c, 1);
1992         btrfs_set_header_level(c, level);
1993         btrfs_set_header_bytenr(c, c->start);
1994         btrfs_set_header_generation(c, trans->transid);
1995         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
1996         btrfs_set_header_owner(c, root->root_key.objectid);
1997
1998         write_extent_buffer(c, root->fs_info->fsid,
1999                             (unsigned long)btrfs_header_fsid(c),
2000                             BTRFS_FSID_SIZE);
2001
2002         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
2003                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
2004                             BTRFS_UUID_SIZE);
2005
2006         btrfs_set_node_key(c, &lower_key, 0);
2007         btrfs_set_node_blockptr(c, 0, lower->start);
2008         lower_gen = btrfs_header_generation(lower);
2009         WARN_ON(lower_gen != trans->transid);
2010
2011         btrfs_set_node_ptr_generation(c, 0, lower_gen);
2012
2013         btrfs_mark_buffer_dirty(c);
2014
2015         old = root->node;
2016         rcu_assign_pointer(root->node, c);
2017
2018         /* the super has an extra ref to root->node */
2019         free_extent_buffer(old);
2020
2021         add_root_to_dirty_list(root);
2022         extent_buffer_get(c);
2023         path->nodes[level] = c;
2024         path->locks[level] = 1;
2025         path->slots[level] = 0;
2026         return 0;
2027 }
2028
2029 /*
2030  * worker function to insert a single pointer in a node.
2031  * the node should have enough room for the pointer already
2032  *
2033  * slot and level indicate where you want the key to go, and
2034  * blocknr is the block the key points to.
2035  *
2036  * returns zero on success and < 0 on any error
2037  */
2038 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2039                       *root, struct btrfs_path *path, struct btrfs_disk_key
2040                       *key, u64 bytenr, int slot, int level)
2041 {
2042         struct extent_buffer *lower;
2043         int nritems;
2044
2045         BUG_ON(!path->nodes[level]);
2046         btrfs_assert_tree_locked(path->nodes[level]);
2047         lower = path->nodes[level];
2048         nritems = btrfs_header_nritems(lower);
2049         BUG_ON(slot > nritems);
2050         if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2051                 BUG();
2052         if (slot != nritems) {
2053                 memmove_extent_buffer(lower,
2054                               btrfs_node_key_ptr_offset(slot + 1),
2055                               btrfs_node_key_ptr_offset(slot),
2056                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2057         }
2058         btrfs_set_node_key(lower, key, slot);
2059         btrfs_set_node_blockptr(lower, slot, bytenr);
2060         WARN_ON(trans->transid == 0);
2061         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2062         btrfs_set_header_nritems(lower, nritems + 1);
2063         btrfs_mark_buffer_dirty(lower);
2064         return 0;
2065 }
2066
2067 /*
2068  * split the node at the specified level in path in two.
2069  * The path is corrected to point to the appropriate node after the split
2070  *
2071  * Before splitting this tries to make some room in the node by pushing
2072  * left and right, if either one works, it returns right away.
2073  *
2074  * returns 0 on success and < 0 on failure
2075  */
2076 static noinline int split_node(struct btrfs_trans_handle *trans,
2077                                struct btrfs_root *root,
2078                                struct btrfs_path *path, int level)
2079 {
2080         struct extent_buffer *c;
2081         struct extent_buffer *split;
2082         struct btrfs_disk_key disk_key;
2083         int mid;
2084         int ret;
2085         int wret;
2086         u32 c_nritems;
2087
2088         c = path->nodes[level];
2089         WARN_ON(btrfs_header_generation(c) != trans->transid);
2090         if (c == root->node) {
2091                 /* trying to split the root, lets make a new one */
2092                 ret = insert_new_root(trans, root, path, level + 1);
2093                 if (ret)
2094                         return ret;
2095         } else {
2096                 ret = push_nodes_for_insert(trans, root, path, level);
2097                 c = path->nodes[level];
2098                 if (!ret && btrfs_header_nritems(c) <
2099                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2100                         return 0;
2101                 if (ret < 0)
2102                         return ret;
2103         }
2104
2105         c_nritems = btrfs_header_nritems(c);
2106         mid = (c_nritems + 1) / 2;
2107         btrfs_node_key(c, &disk_key, mid);
2108
2109         split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2110                                         root->root_key.objectid,
2111                                         &disk_key, level, c->start, 0);
2112         if (IS_ERR(split))
2113                 return PTR_ERR(split);
2114
2115         root_add_used(root, root->nodesize);
2116
2117         memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2118         btrfs_set_header_level(split, btrfs_header_level(c));
2119         btrfs_set_header_bytenr(split, split->start);
2120         btrfs_set_header_generation(split, trans->transid);
2121         btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
2122         btrfs_set_header_owner(split, root->root_key.objectid);
2123         write_extent_buffer(split, root->fs_info->fsid,
2124                             (unsigned long)btrfs_header_fsid(split),
2125                             BTRFS_FSID_SIZE);
2126         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2127                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
2128                             BTRFS_UUID_SIZE);
2129
2130
2131         copy_extent_buffer(split, c,
2132                            btrfs_node_key_ptr_offset(0),
2133                            btrfs_node_key_ptr_offset(mid),
2134                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2135         btrfs_set_header_nritems(split, c_nritems - mid);
2136         btrfs_set_header_nritems(c, mid);
2137         ret = 0;
2138
2139         btrfs_mark_buffer_dirty(c);
2140         btrfs_mark_buffer_dirty(split);
2141
2142         wret = insert_ptr(trans, root, path, &disk_key, split->start,
2143                           path->slots[level + 1] + 1,
2144                           level + 1);
2145         if (wret)
2146                 ret = wret;
2147
2148         if (path->slots[level] >= mid) {
2149                 path->slots[level] -= mid;
2150                 btrfs_tree_unlock(c);
2151                 free_extent_buffer(c);
2152                 path->nodes[level] = split;
2153                 path->slots[level + 1] += 1;
2154         } else {
2155                 btrfs_tree_unlock(split);
2156                 free_extent_buffer(split);
2157         }
2158         return ret;
2159 }
2160
2161 /*
2162  * how many bytes are required to store the items in a leaf.  start
2163  * and nr indicate which items in the leaf to check.  This totals up the
2164  * space used both by the item structs and the item data
2165  */
2166 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2167 {
2168         int data_len;
2169         int nritems = btrfs_header_nritems(l);
2170         int end = min(nritems, start + nr) - 1;
2171
2172         if (!nr)
2173                 return 0;
2174         data_len = btrfs_item_end_nr(l, start);
2175         data_len = data_len - btrfs_item_offset_nr(l, end);
2176         data_len += sizeof(struct btrfs_item) * nr;
2177         WARN_ON(data_len < 0);
2178         return data_len;
2179 }
2180
2181 /*
2182  * The space between the end of the leaf items and
2183  * the start of the leaf data.  IOW, how much room
2184  * the leaf has left for both items and data
2185  */
2186 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2187                                    struct extent_buffer *leaf)
2188 {
2189         int nritems = btrfs_header_nritems(leaf);
2190         int ret;
2191         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2192         if (ret < 0) {
2193                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2194                        "used %d nritems %d\n",
2195                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2196                        leaf_space_used(leaf, 0, nritems), nritems);
2197         }
2198         return ret;
2199 }
2200
2201 /*
2202  * min slot controls the lowest index we're willing to push to the
2203  * right.  We'll push up to and including min_slot, but no lower
2204  */
2205 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2206                                       struct btrfs_root *root,
2207                                       struct btrfs_path *path,
2208                                       int data_size, int empty,
2209                                       struct extent_buffer *right,
2210                                       int free_space, u32 left_nritems,
2211                                       u32 min_slot)
2212 {
2213         struct extent_buffer *left = path->nodes[0];
2214         struct extent_buffer *upper = path->nodes[1];
2215         struct btrfs_disk_key disk_key;
2216         int slot;
2217         u32 i;
2218         int push_space = 0;
2219         int push_items = 0;
2220         struct btrfs_item *item;
2221         u32 nr;
2222         u32 right_nritems;
2223         u32 data_end;
2224         u32 this_item_size;
2225
2226         if (empty)
2227                 nr = 0;
2228         else
2229                 nr = max_t(u32, 1, min_slot);
2230
2231         if (path->slots[0] >= left_nritems)
2232                 push_space += data_size;
2233
2234         slot = path->slots[1];
2235         i = left_nritems - 1;
2236         while (i >= nr) {
2237                 item = btrfs_item_nr(left, i);
2238
2239                 if (!empty && push_items > 0) {
2240                         if (path->slots[0] > i)
2241                                 break;
2242                         if (path->slots[0] == i) {
2243                                 int space = btrfs_leaf_free_space(root, left);
2244                                 if (space + push_space * 2 > free_space)
2245                                         break;
2246                         }
2247                 }
2248
2249                 if (path->slots[0] == i)
2250                         push_space += data_size;
2251
2252                 if (!left->map_token) {
2253                         map_extent_buffer(left, (unsigned long)item,
2254                                         sizeof(struct btrfs_item),
2255                                         &left->map_token, &left->kaddr,
2256                                         &left->map_start, &left->map_len,
2257                                         KM_USER1);
2258                 }
2259
2260                 this_item_size = btrfs_item_size(left, item);
2261                 if (this_item_size + sizeof(*item) + push_space > free_space)
2262                         break;
2263
2264                 push_items++;
2265                 push_space += this_item_size + sizeof(*item);
2266                 if (i == 0)
2267                         break;
2268                 i--;
2269         }
2270         if (left->map_token) {
2271                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2272                 left->map_token = NULL;
2273         }
2274
2275         if (push_items == 0)
2276                 goto out_unlock;
2277
2278         if (!empty && push_items == left_nritems)
2279                 WARN_ON(1);
2280
2281         /* push left to right */
2282         right_nritems = btrfs_header_nritems(right);
2283
2284         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2285         push_space -= leaf_data_end(root, left);
2286
2287         /* make room in the right data area */
2288         data_end = leaf_data_end(root, right);
2289         memmove_extent_buffer(right,
2290                               btrfs_leaf_data(right) + data_end - push_space,
2291                               btrfs_leaf_data(right) + data_end,
2292                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
2293
2294         /* copy from the left data area */
2295         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2296                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2297                      btrfs_leaf_data(left) + leaf_data_end(root, left),
2298                      push_space);
2299
2300         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2301                               btrfs_item_nr_offset(0),
2302                               right_nritems * sizeof(struct btrfs_item));
2303
2304         /* copy the items from left to right */
2305         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2306                    btrfs_item_nr_offset(left_nritems - push_items),
2307                    push_items * sizeof(struct btrfs_item));
2308
2309         /* update the item pointers */
2310         right_nritems += push_items;
2311         btrfs_set_header_nritems(right, right_nritems);
2312         push_space = BTRFS_LEAF_DATA_SIZE(root);
2313         for (i = 0; i < right_nritems; i++) {
2314                 item = btrfs_item_nr(right, i);
2315                 if (!right->map_token) {
2316                         map_extent_buffer(right, (unsigned long)item,
2317                                         sizeof(struct btrfs_item),
2318                                         &right->map_token, &right->kaddr,
2319                                         &right->map_start, &right->map_len,
2320                                         KM_USER1);
2321                 }
2322                 push_space -= btrfs_item_size(right, item);
2323                 btrfs_set_item_offset(right, item, push_space);
2324         }
2325
2326         if (right->map_token) {
2327                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2328                 right->map_token = NULL;
2329         }
2330         left_nritems -= push_items;
2331         btrfs_set_header_nritems(left, left_nritems);
2332
2333         if (left_nritems)
2334                 btrfs_mark_buffer_dirty(left);
2335         else
2336                 clean_tree_block(trans, root, left);
2337
2338         btrfs_mark_buffer_dirty(right);
2339
2340         btrfs_item_key(right, &disk_key, 0);
2341         btrfs_set_node_key(upper, &disk_key, slot + 1);
2342         btrfs_mark_buffer_dirty(upper);
2343
2344         /* then fixup the leaf pointer in the path */
2345         if (path->slots[0] >= left_nritems) {
2346                 path->slots[0] -= left_nritems;
2347                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2348                         clean_tree_block(trans, root, path->nodes[0]);
2349                 btrfs_tree_unlock(path->nodes[0]);
2350                 free_extent_buffer(path->nodes[0]);
2351                 path->nodes[0] = right;
2352                 path->slots[1] += 1;
2353         } else {
2354                 btrfs_tree_unlock(right);
2355                 free_extent_buffer(right);
2356         }
2357         return 0;
2358
2359 out_unlock:
2360         btrfs_tree_unlock(right);
2361         free_extent_buffer(right);
2362         return 1;
2363 }
2364
2365 /*
2366  * push some data in the path leaf to the right, trying to free up at
2367  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2368  *
2369  * returns 1 if the push failed because the other node didn't have enough
2370  * room, 0 if everything worked out and < 0 if there were major errors.
2371  *
2372  * this will push starting from min_slot to the end of the leaf.  It won't
2373  * push any slot lower than min_slot
2374  */
2375 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2376                            *root, struct btrfs_path *path,
2377                            int min_data_size, int data_size,
2378                            int empty, u32 min_slot)
2379 {
2380         struct extent_buffer *left = path->nodes[0];
2381         struct extent_buffer *right;
2382         struct extent_buffer *upper;
2383         int slot;
2384         int free_space;
2385         u32 left_nritems;
2386         int ret;
2387
2388         if (!path->nodes[1])
2389                 return 1;
2390
2391         slot = path->slots[1];
2392         upper = path->nodes[1];
2393         if (slot >= btrfs_header_nritems(upper) - 1)
2394                 return 1;
2395
2396         btrfs_assert_tree_locked(path->nodes[1]);
2397
2398         right = read_node_slot(root, upper, slot + 1);
2399         if (right == NULL)
2400                 return 1;
2401
2402         btrfs_tree_lock(right);
2403         btrfs_set_lock_blocking(right);
2404
2405         free_space = btrfs_leaf_free_space(root, right);
2406         if (free_space < data_size)
2407                 goto out_unlock;
2408
2409         /* cow and double check */
2410         ret = btrfs_cow_block(trans, root, right, upper,
2411                               slot + 1, &right);
2412         if (ret)
2413                 goto out_unlock;
2414
2415         free_space = btrfs_leaf_free_space(root, right);
2416         if (free_space < data_size)
2417                 goto out_unlock;
2418
2419         left_nritems = btrfs_header_nritems(left);
2420         if (left_nritems == 0)
2421                 goto out_unlock;
2422
2423         return __push_leaf_right(trans, root, path, min_data_size, empty,
2424                                 right, free_space, left_nritems, min_slot);
2425 out_unlock:
2426         btrfs_tree_unlock(right);
2427         free_extent_buffer(right);
2428         return 1;
2429 }
2430
2431 /*
2432  * push some data in the path leaf to the left, trying to free up at
2433  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2434  *
2435  * max_slot can put a limit on how far into the leaf we'll push items.  The
2436  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2437  * items
2438  */
2439 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2440                                      struct btrfs_root *root,
2441                                      struct btrfs_path *path, int data_size,
2442                                      int empty, struct extent_buffer *left,
2443                                      int free_space, u32 right_nritems,
2444                                      u32 max_slot)
2445 {
2446         struct btrfs_disk_key disk_key;
2447         struct extent_buffer *right = path->nodes[0];
2448         int i;
2449         int push_space = 0;
2450         int push_items = 0;
2451         struct btrfs_item *item;
2452         u32 old_left_nritems;
2453         u32 nr;
2454         int ret = 0;
2455         int wret;
2456         u32 this_item_size;
2457         u32 old_left_item_size;
2458
2459         if (empty)
2460                 nr = min(right_nritems, max_slot);
2461         else
2462                 nr = min(right_nritems - 1, max_slot);
2463
2464         for (i = 0; i < nr; i++) {
2465                 item = btrfs_item_nr(right, i);
2466                 if (!right->map_token) {
2467                         map_extent_buffer(right, (unsigned long)item,
2468                                         sizeof(struct btrfs_item),
2469                                         &right->map_token, &right->kaddr,
2470                                         &right->map_start, &right->map_len,
2471                                         KM_USER1);
2472                 }
2473
2474                 if (!empty && push_items > 0) {
2475                         if (path->slots[0] < i)
2476                                 break;
2477                         if (path->slots[0] == i) {
2478                                 int space = btrfs_leaf_free_space(root, right);
2479                                 if (space + push_space * 2 > free_space)
2480                                         break;
2481                         }
2482                 }
2483
2484                 if (path->slots[0] == i)
2485                         push_space += data_size;
2486
2487                 this_item_size = btrfs_item_size(right, item);
2488                 if (this_item_size + sizeof(*item) + push_space > free_space)
2489                         break;
2490
2491                 push_items++;
2492                 push_space += this_item_size + sizeof(*item);
2493         }
2494
2495         if (right->map_token) {
2496                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2497                 right->map_token = NULL;
2498         }
2499
2500         if (push_items == 0) {
2501                 ret = 1;
2502                 goto out;
2503         }
2504         if (!empty && push_items == btrfs_header_nritems(right))
2505                 WARN_ON(1);
2506
2507         /* push data from right to left */
2508         copy_extent_buffer(left, right,
2509                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
2510                            btrfs_item_nr_offset(0),
2511                            push_items * sizeof(struct btrfs_item));
2512
2513         push_space = BTRFS_LEAF_DATA_SIZE(root) -
2514                      btrfs_item_offset_nr(right, push_items - 1);
2515
2516         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2517                      leaf_data_end(root, left) - push_space,
2518                      btrfs_leaf_data(right) +
2519                      btrfs_item_offset_nr(right, push_items - 1),
2520                      push_space);
2521         old_left_nritems = btrfs_header_nritems(left);
2522         BUG_ON(old_left_nritems <= 0);
2523
2524         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2525         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2526                 u32 ioff;
2527
2528                 item = btrfs_item_nr(left, i);
2529                 if (!left->map_token) {
2530                         map_extent_buffer(left, (unsigned long)item,
2531                                         sizeof(struct btrfs_item),
2532                                         &left->map_token, &left->kaddr,
2533                                         &left->map_start, &left->map_len,
2534                                         KM_USER1);
2535                 }
2536
2537                 ioff = btrfs_item_offset(left, item);
2538                 btrfs_set_item_offset(left, item,
2539                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2540         }
2541         btrfs_set_header_nritems(left, old_left_nritems + push_items);
2542         if (left->map_token) {
2543                 unmap_extent_buffer(left, left->map_token, KM_USER1);
2544                 left->map_token = NULL;
2545         }
2546
2547         /* fixup right node */
2548         if (push_items > right_nritems) {
2549                 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2550                        right_nritems);
2551                 WARN_ON(1);
2552         }
2553
2554         if (push_items < right_nritems) {
2555                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2556                                                   leaf_data_end(root, right);
2557                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2558                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
2559                                       btrfs_leaf_data(right) +
2560                                       leaf_data_end(root, right), push_space);
2561
2562                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2563                               btrfs_item_nr_offset(push_items),
2564                              (btrfs_header_nritems(right) - push_items) *
2565                              sizeof(struct btrfs_item));
2566         }
2567         right_nritems -= push_items;
2568         btrfs_set_header_nritems(right, right_nritems);
2569         push_space = BTRFS_LEAF_DATA_SIZE(root);
2570         for (i = 0; i < right_nritems; i++) {
2571                 item = btrfs_item_nr(right, i);
2572
2573                 if (!right->map_token) {
2574                         map_extent_buffer(right, (unsigned long)item,
2575                                         sizeof(struct btrfs_item),
2576                                         &right->map_token, &right->kaddr,
2577                                         &right->map_start, &right->map_len,
2578                                         KM_USER1);
2579                 }
2580
2581                 push_space = push_space - btrfs_item_size(right, item);
2582                 btrfs_set_item_offset(right, item, push_space);
2583         }
2584         if (right->map_token) {
2585                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2586                 right->map_token = NULL;
2587         }
2588
2589         btrfs_mark_buffer_dirty(left);
2590         if (right_nritems)
2591                 btrfs_mark_buffer_dirty(right);
2592         else
2593                 clean_tree_block(trans, root, right);
2594
2595         btrfs_item_key(right, &disk_key, 0);
2596         wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2597         if (wret)
2598                 ret = wret;
2599
2600         /* then fixup the leaf pointer in the path */
2601         if (path->slots[0] < push_items) {
2602                 path->slots[0] += old_left_nritems;
2603                 btrfs_tree_unlock(path->nodes[0]);
2604                 free_extent_buffer(path->nodes[0]);
2605                 path->nodes[0] = left;
2606                 path->slots[1] -= 1;
2607         } else {
2608                 btrfs_tree_unlock(left);
2609                 free_extent_buffer(left);
2610                 path->slots[0] -= push_items;
2611         }
2612         BUG_ON(path->slots[0] < 0);
2613         return ret;
2614 out:
2615         btrfs_tree_unlock(left);
2616         free_extent_buffer(left);
2617         return ret;
2618 }
2619
2620 /*
2621  * push some data in the path leaf to the left, trying to free up at
2622  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2623  *
2624  * max_slot can put a limit on how far into the leaf we'll push items.  The
2625  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2626  * items
2627  */
2628 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2629                           *root, struct btrfs_path *path, int min_data_size,
2630                           int data_size, int empty, u32 max_slot)
2631 {
2632         struct extent_buffer *right = path->nodes[0];
2633         struct extent_buffer *left;
2634         int slot;
2635         int free_space;
2636         u32 right_nritems;
2637         int ret = 0;
2638
2639         slot = path->slots[1];
2640         if (slot == 0)
2641                 return 1;
2642         if (!path->nodes[1])
2643                 return 1;
2644
2645         right_nritems = btrfs_header_nritems(right);
2646         if (right_nritems == 0)
2647                 return 1;
2648
2649         btrfs_assert_tree_locked(path->nodes[1]);
2650
2651         left = read_node_slot(root, path->nodes[1], slot - 1);
2652         if (left == NULL)
2653                 return 1;
2654
2655         btrfs_tree_lock(left);
2656         btrfs_set_lock_blocking(left);
2657
2658         free_space = btrfs_leaf_free_space(root, left);
2659         if (free_space < data_size) {
2660                 ret = 1;
2661                 goto out;
2662         }
2663
2664         /* cow and double check */
2665         ret = btrfs_cow_block(trans, root, left,
2666                               path->nodes[1], slot - 1, &left);
2667         if (ret) {
2668                 /* we hit -ENOSPC, but it isn't fatal here */
2669                 ret = 1;
2670                 goto out;
2671         }
2672
2673         free_space = btrfs_leaf_free_space(root, left);
2674         if (free_space < data_size) {
2675                 ret = 1;
2676                 goto out;
2677         }
2678
2679         return __push_leaf_left(trans, root, path, min_data_size,
2680                                empty, left, free_space, right_nritems,
2681                                max_slot);
2682 out:
2683         btrfs_tree_unlock(left);
2684         free_extent_buffer(left);
2685         return ret;
2686 }
2687
2688 /*
2689  * split the path's leaf in two, making sure there is at least data_size
2690  * available for the resulting leaf level of the path.
2691  *
2692  * returns 0 if all went well and < 0 on failure.
2693  */
2694 static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2695                                struct btrfs_root *root,
2696                                struct btrfs_path *path,
2697                                struct extent_buffer *l,
2698                                struct extent_buffer *right,
2699                                int slot, int mid, int nritems)
2700 {
2701         int data_copy_size;
2702         int rt_data_off;
2703         int i;
2704         int ret = 0;
2705         int wret;
2706         struct btrfs_disk_key disk_key;
2707
2708         nritems = nritems - mid;
2709         btrfs_set_header_nritems(right, nritems);
2710         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2711
2712         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2713                            btrfs_item_nr_offset(mid),
2714                            nritems * sizeof(struct btrfs_item));
2715
2716         copy_extent_buffer(right, l,
2717                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2718                      data_copy_size, btrfs_leaf_data(l) +
2719                      leaf_data_end(root, l), data_copy_size);
2720
2721         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2722                       btrfs_item_end_nr(l, mid);
2723
2724         for (i = 0; i < nritems; i++) {
2725                 struct btrfs_item *item = btrfs_item_nr(right, i);
2726                 u32 ioff;
2727
2728                 if (!right->map_token) {
2729                         map_extent_buffer(right, (unsigned long)item,
2730                                         sizeof(struct btrfs_item),
2731                                         &right->map_token, &right->kaddr,
2732                                         &right->map_start, &right->map_len,
2733                                         KM_USER1);
2734                 }
2735
2736                 ioff = btrfs_item_offset(right, item);
2737                 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2738         }
2739
2740         if (right->map_token) {
2741                 unmap_extent_buffer(right, right->map_token, KM_USER1);
2742                 right->map_token = NULL;
2743         }
2744
2745         btrfs_set_header_nritems(l, mid);
2746         ret = 0;
2747         btrfs_item_key(right, &disk_key, 0);
2748         wret = insert_ptr(trans, root, path, &disk_key, right->start,
2749                           path->slots[1] + 1, 1);
2750         if (wret)
2751                 ret = wret;
2752
2753         btrfs_mark_buffer_dirty(right);
2754         btrfs_mark_buffer_dirty(l);
2755         BUG_ON(path->slots[0] != slot);
2756
2757         if (mid <= slot) {
2758                 btrfs_tree_unlock(path->nodes[0]);
2759                 free_extent_buffer(path->nodes[0]);
2760                 path->nodes[0] = right;
2761                 path->slots[0] -= mid;
2762                 path->slots[1] += 1;
2763         } else {
2764                 btrfs_tree_unlock(right);
2765                 free_extent_buffer(right);
2766         }
2767
2768         BUG_ON(path->slots[0] < 0);
2769
2770         return ret;
2771 }
2772
2773 /*
2774  * double splits happen when we need to insert a big item in the middle
2775  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
2776  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2777  *          A                 B                 C
2778  *
2779  * We avoid this by trying to push the items on either side of our target
2780  * into the adjacent leaves.  If all goes well we can avoid the double split
2781  * completely.
2782  */
2783 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2784                                           struct btrfs_root *root,
2785                                           struct btrfs_path *path,
2786                                           int data_size)
2787 {
2788         int ret;
2789         int progress = 0;
2790         int slot;
2791         u32 nritems;
2792
2793         slot = path->slots[0];
2794
2795         /*
2796          * try to push all the items after our slot into the
2797          * right leaf
2798          */
2799         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2800         if (ret < 0)
2801                 return ret;
2802
2803         if (ret == 0)
2804                 progress++;
2805
2806         nritems = btrfs_header_nritems(path->nodes[0]);
2807         /*
2808          * our goal is to get our slot at the start or end of a leaf.  If
2809          * we've done so we're done
2810          */
2811         if (path->slots[0] == 0 || path->slots[0] == nritems)
2812                 return 0;
2813
2814         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2815                 return 0;
2816
2817         /* try to push all the items before our slot into the next leaf */
2818         slot = path->slots[0];
2819         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2820         if (ret < 0)
2821                 return ret;
2822
2823         if (ret == 0)
2824                 progress++;
2825
2826         if (progress)
2827                 return 0;
2828         return 1;
2829 }
2830
2831 /*
2832  * split the path's leaf in two, making sure there is at least data_size
2833  * available for the resulting leaf level of the path.
2834  *
2835  * returns 0 if all went well and < 0 on failure.
2836  */
2837 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2838                                struct btrfs_root *root,
2839                                struct btrfs_key *ins_key,
2840                                struct btrfs_path *path, int data_size,
2841                                int extend)
2842 {
2843         struct btrfs_disk_key disk_key;
2844         struct extent_buffer *l;
2845         u32 nritems;
2846         int mid;
2847         int slot;
2848         struct extent_buffer *right;
2849         int ret = 0;
2850         int wret;
2851         int split;
2852         int num_doubles = 0;
2853         int tried_avoid_double = 0;
2854
2855         l = path->nodes[0];
2856         slot = path->slots[0];
2857         if (extend && data_size + btrfs_item_size_nr(l, slot) +
2858             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
2859                 return -EOVERFLOW;
2860
2861         /* first try to make some room by pushing left and right */
2862         if (data_size) {
2863                 wret = push_leaf_right(trans, root, path, data_size,
2864                                        data_size, 0, 0);
2865                 if (wret < 0)
2866                         return wret;
2867                 if (wret) {
2868                         wret = push_leaf_left(trans, root, path, data_size,
2869                                               data_size, 0, (u32)-1);
2870                         if (wret < 0)
2871                                 return wret;
2872                 }
2873                 l = path->nodes[0];
2874
2875                 /* did the pushes work? */
2876                 if (btrfs_leaf_free_space(root, l) >= data_size)
2877                         return 0;
2878         }
2879
2880         if (!path->nodes[1]) {
2881                 ret = insert_new_root(trans, root, path, 1);
2882                 if (ret)
2883                         return ret;
2884         }
2885 again:
2886         split = 1;
2887         l = path->nodes[0];
2888         slot = path->slots[0];
2889         nritems = btrfs_header_nritems(l);
2890         mid = (nritems + 1) / 2;
2891
2892         if (mid <= slot) {
2893                 if (nritems == 1 ||
2894                     leaf_space_used(l, mid, nritems - mid) + data_size >
2895                         BTRFS_LEAF_DATA_SIZE(root)) {
2896                         if (slot >= nritems) {
2897                                 split = 0;
2898                         } else {
2899                                 mid = slot;
2900                                 if (mid != nritems &&
2901                                     leaf_space_used(l, mid, nritems - mid) +
2902                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2903                                         if (data_size && !tried_avoid_double)
2904                                                 goto push_for_double;
2905                                         split = 2;
2906                                 }
2907                         }
2908                 }
2909         } else {
2910                 if (leaf_space_used(l, 0, mid) + data_size >
2911                         BTRFS_LEAF_DATA_SIZE(root)) {
2912                         if (!extend && data_size && slot == 0) {
2913                                 split = 0;
2914                         } else if ((extend || !data_size) && slot == 0) {
2915                                 mid = 1;
2916                         } else {
2917                                 mid = slot;
2918                                 if (mid != nritems &&
2919                                     leaf_space_used(l, mid, nritems - mid) +
2920                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2921                                         if (data_size && !tried_avoid_double)
2922                                                 goto push_for_double;
2923                                         split = 2 ;
2924                                 }
2925                         }
2926                 }
2927         }
2928
2929         if (split == 0)
2930                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2931         else
2932                 btrfs_item_key(l, &disk_key, mid);
2933
2934         right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2935                                         root->root_key.objectid,
2936                                         &disk_key, 0, l->start, 0);
2937         if (IS_ERR(right))
2938                 return PTR_ERR(right);
2939
2940         root_add_used(root, root->leafsize);
2941
2942         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2943         btrfs_set_header_bytenr(right, right->start);
2944         btrfs_set_header_generation(right, trans->transid);
2945         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
2946         btrfs_set_header_owner(right, root->root_key.objectid);
2947         btrfs_set_header_level(right, 0);
2948         write_extent_buffer(right, root->fs_info->fsid,
2949                             (unsigned long)btrfs_header_fsid(right),
2950                             BTRFS_FSID_SIZE);
2951
2952         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2953                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
2954                             BTRFS_UUID_SIZE);
2955
2956         if (split == 0) {
2957                 if (mid <= slot) {
2958                         btrfs_set_header_nritems(right, 0);
2959                         wret = insert_ptr(trans, root, path,
2960                                           &disk_key, right->start,
2961                                           path->slots[1] + 1, 1);
2962                         if (wret)
2963                                 ret = wret;
2964
2965                         btrfs_tree_unlock(path->nodes[0]);
2966                         free_extent_buffer(path->nodes[0]);
2967                         path->nodes[0] = right;
2968                         path->slots[0] = 0;
2969                         path->slots[1] += 1;
2970                 } else {
2971                         btrfs_set_header_nritems(right, 0);
2972                         wret = insert_ptr(trans, root, path,
2973                                           &disk_key,
2974                                           right->start,
2975                                           path->slots[1], 1);
2976                         if (wret)
2977                                 ret = wret;
2978                         btrfs_tree_unlock(path->nodes[0]);
2979                         free_extent_buffer(path->nodes[0]);
2980                         path->nodes[0] = right;
2981                         path->slots[0] = 0;
2982                         if (path->slots[1] == 0) {
2983                                 wret = fixup_low_keys(trans, root,
2984                                                 path, &disk_key, 1);
2985                                 if (wret)
2986                                         ret = wret;
2987                         }
2988                 }
2989                 btrfs_mark_buffer_dirty(right);
2990                 return ret;
2991         }
2992
2993         ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2994         BUG_ON(ret);
2995
2996         if (split == 2) {
2997                 BUG_ON(num_doubles != 0);
2998                 num_doubles++;
2999                 goto again;
3000         }
3001
3002         return ret;
3003
3004 push_for_double:
3005         push_for_double_split(trans, root, path, data_size);
3006         tried_avoid_double = 1;
3007         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3008                 return 0;
3009         goto again;
3010 }
3011
3012 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3013                                          struct btrfs_root *root,
3014                                          struct btrfs_path *path, int ins_len)
3015 {
3016         struct btrfs_key key;
3017         struct extent_buffer *leaf;
3018         struct btrfs_file_extent_item *fi;
3019         u64 extent_len = 0;
3020         u32 item_size;
3021         int ret;
3022
3023         leaf = path->nodes[0];
3024         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3025
3026         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3027                key.type != BTRFS_EXTENT_CSUM_KEY);
3028
3029         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3030                 return 0;
3031
3032         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3033         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3034                 fi = btrfs_item_ptr(leaf, path->slots[0],
3035                                     struct btrfs_file_extent_item);
3036                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3037         }
3038         btrfs_release_path(path);
3039
3040         path->keep_locks = 1;
3041         path->search_for_split = 1;
3042         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3043         path->search_for_split = 0;
3044         if (ret < 0)
3045                 goto err;
3046
3047         ret = -EAGAIN;
3048         leaf = path->nodes[0];
3049         /* if our item isn't there or got smaller, return now */
3050         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3051                 goto err;
3052
3053         /* the leaf has  changed, it now has room.  return now */
3054         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3055                 goto err;
3056
3057         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3058                 fi = btrfs_item_ptr(leaf, path->slots[0],
3059                                     struct btrfs_file_extent_item);
3060                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3061                         goto err;
3062         }
3063
3064         btrfs_set_path_blocking(path);
3065         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3066         if (ret)
3067                 goto err;
3068
3069         path->keep_locks = 0;
3070         btrfs_unlock_up_safe(path, 1);
3071         return 0;
3072 err:
3073         path->keep_locks = 0;
3074         return ret;
3075 }
3076
3077 static noinline int split_item(struct btrfs_trans_handle *trans,
3078                                struct btrfs_root *root,
3079                                struct btrfs_path *path,
3080                                struct btrfs_key *new_key,
3081                                unsigned long split_offset)
3082 {
3083         struct extent_buffer *leaf;
3084         struct btrfs_item *item;
3085         struct btrfs_item *new_item;
3086         int slot;
3087         char *buf;
3088         u32 nritems;
3089         u32 item_size;
3090         u32 orig_offset;
3091         struct btrfs_disk_key disk_key;
3092
3093         leaf = path->nodes[0];
3094         BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3095
3096         btrfs_set_path_blocking(path);
3097
3098         item = btrfs_item_nr(leaf, path->slots[0]);
3099         orig_offset = btrfs_item_offset(leaf, item);
3100         item_size = btrfs_item_size(leaf, item);
3101
3102         buf = kmalloc(item_size, GFP_NOFS);
3103         if (!buf)
3104                 return -ENOMEM;
3105
3106         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3107                             path->slots[0]), item_size);
3108
3109         slot = path->slots[0] + 1;
3110         nritems = btrfs_header_nritems(leaf);
3111         if (slot != nritems) {
3112                 /* shift the items */
3113                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3114                                 btrfs_item_nr_offset(slot),
3115                                 (nritems - slot) * sizeof(struct btrfs_item));
3116         }
3117
3118         btrfs_cpu_key_to_disk(&disk_key, new_key);
3119         btrfs_set_item_key(leaf, &disk_key, slot);
3120
3121         new_item = btrfs_item_nr(leaf, slot);
3122
3123         btrfs_set_item_offset(leaf, new_item, orig_offset);
3124         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3125
3126         btrfs_set_item_offset(leaf, item,
3127                               orig_offset + item_size - split_offset);
3128         btrfs_set_item_size(leaf, item, split_offset);
3129
3130         btrfs_set_header_nritems(leaf, nritems + 1);
3131
3132         /* write the data for the start of the original item */
3133         write_extent_buffer(leaf, buf,
3134                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3135                             split_offset);
3136
3137         /* write the data for the new item */
3138         write_extent_buffer(leaf, buf + split_offset,
3139                             btrfs_item_ptr_offset(leaf, slot),
3140                             item_size - split_offset);
3141         btrfs_mark_buffer_dirty(leaf);
3142
3143         BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
3144         kfree(buf);
3145         return 0;
3146 }
3147
3148 /*
3149  * This function splits a single item into two items,
3150  * giving 'new_key' to the new item and splitting the
3151  * old one at split_offset (from the start of the item).
3152  *
3153  * The path may be released by this operation.  After
3154  * the split, the path is pointing to the old item.  The
3155  * new item is going to be in the same node as the old one.
3156  *
3157  * Note, the item being split must be smaller enough to live alone on
3158  * a tree block with room for one extra struct btrfs_item
3159  *
3160  * This allows us to split the item in place, keeping a lock on the
3161  * leaf the entire time.
3162  */
3163 int btrfs_split_item(struct btrfs_trans_handle *trans,
3164                      struct btrfs_root *root,
3165                      struct btrfs_path *path,
3166                      struct btrfs_key *new_key,
3167                      unsigned long split_offset)
3168 {
3169         int ret;
3170         ret = setup_leaf_for_split(trans, root, path,
3171                                    sizeof(struct btrfs_item));
3172         if (ret)
3173                 return ret;
3174
3175         ret = split_item(trans, root, path, new_key, split_offset);
3176         return ret;
3177 }
3178
3179 /*
3180  * This function duplicate a item, giving 'new_key' to the new item.
3181  * It guarantees both items live in the same tree leaf and the new item
3182  * is contiguous with the original item.
3183  *
3184  * This allows us to split file extent in place, keeping a lock on the
3185  * leaf the entire time.
3186  */
3187 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3188                          struct btrfs_root *root,
3189                          struct btrfs_path *path,
3190                          struct btrfs_key *new_key)
3191 {
3192         struct extent_buffer *leaf;
3193         int ret;
3194         u32 item_size;
3195
3196         leaf = path->nodes[0];
3197         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3198         ret = setup_leaf_for_split(trans, root, path,
3199                                    item_size + sizeof(struct btrfs_item));
3200         if (ret)
3201                 return ret;
3202
3203         path->slots[0]++;
3204         ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
3205                                      item_size, item_size +
3206                                      sizeof(struct btrfs_item), 1);
3207         BUG_ON(ret);
3208
3209         leaf = path->nodes[0];
3210         memcpy_extent_buffer(leaf,
3211                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3212                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3213                              item_size);
3214         return 0;
3215 }
3216
3217 /*
3218  * make the item pointed to by the path smaller.  new_size indicates
3219  * how small to make it, and from_end tells us if we just chop bytes
3220  * off the end of the item or if we shift the item to chop bytes off
3221  * the front.
3222  */
3223 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3224                         struct btrfs_root *root,
3225                         struct btrfs_path *path,
3226                         u32 new_size, int from_end)
3227 {
3228         int slot;
3229         struct extent_buffer *leaf;
3230         struct btrfs_item *item;
3231         u32 nritems;
3232         unsigned int data_end;
3233         unsigned int old_data_start;
3234         unsigned int old_size;
3235         unsigned int size_diff;
3236         int i;
3237
3238         leaf = path->nodes[0];
3239         slot = path->slots[0];
3240
3241         old_size = btrfs_item_size_nr(leaf, slot);
3242         if (old_size == new_size)
3243                 return 0;
3244
3245         nritems = btrfs_header_nritems(leaf);
3246         data_end = leaf_data_end(root, leaf);
3247
3248         old_data_start = btrfs_item_offset_nr(leaf, slot);
3249
3250         size_diff = old_size - new_size;
3251
3252         BUG_ON(slot < 0);
3253         BUG_ON(slot >= nritems);
3254
3255         /*
3256          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3257          */
3258         /* first correct the data pointers */
3259         for (i = slot; i < nritems; i++) {
3260                 u32 ioff;
3261                 item = btrfs_item_nr(leaf, i);
3262
3263                 if (!leaf->map_token) {
3264                         map_extent_buffer(leaf, (unsigned long)item,
3265                                         sizeof(struct btrfs_item),
3266                                         &leaf->map_token, &leaf->kaddr,
3267                                         &leaf->map_start, &leaf->map_len,
3268                                         KM_USER1);
3269                 }
3270
3271                 ioff = btrfs_item_offset(leaf, item);
3272                 btrfs_set_item_offset(leaf, item, ioff + size_diff);
3273         }
3274
3275         if (leaf->map_token) {
3276                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3277                 leaf->map_token = NULL;
3278         }
3279
3280         /* shift the data */
3281         if (from_end) {
3282                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3283                               data_end + size_diff, btrfs_leaf_data(leaf) +
3284                               data_end, old_data_start + new_size - data_end);
3285         } else {
3286                 struct btrfs_disk_key disk_key;
3287                 u64 offset;
3288
3289                 btrfs_item_key(leaf, &disk_key, slot);
3290
3291                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3292                         unsigned long ptr;
3293                         struct btrfs_file_extent_item *fi;
3294
3295                         fi = btrfs_item_ptr(leaf, slot,
3296                                             struct btrfs_file_extent_item);
3297                         fi = (struct btrfs_file_extent_item *)(
3298                              (unsigned long)fi - size_diff);
3299
3300                         if (btrfs_file_extent_type(leaf, fi) ==
3301                             BTRFS_FILE_EXTENT_INLINE) {
3302                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3303                                 memmove_extent_buffer(leaf, ptr,
3304                                       (unsigned long)fi,
3305                                       offsetof(struct btrfs_file_extent_item,
3306                                                  disk_bytenr));
3307                         }
3308                 }
3309
3310                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3311                               data_end + size_diff, btrfs_leaf_data(leaf) +
3312                               data_end, old_data_start - data_end);
3313
3314                 offset = btrfs_disk_key_offset(&disk_key);
3315                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3316                 btrfs_set_item_key(leaf, &disk_key, slot);
3317                 if (slot == 0)
3318                         fixup_low_keys(trans, root, path, &disk_key, 1);
3319         }
3320
3321         item = btrfs_item_nr(leaf, slot);
3322         btrfs_set_item_size(leaf, item, new_size);
3323         btrfs_mark_buffer_dirty(leaf);
3324
3325         if (btrfs_leaf_free_space(root, leaf) < 0) {
3326                 btrfs_print_leaf(root, leaf);
3327                 BUG();
3328         }
3329         return 0;
3330 }
3331
3332 /*
3333  * make the item pointed to by the path bigger, data_size is the new size.
3334  */
3335 int btrfs_extend_item(struct btrfs_trans_handle *trans,
3336                       struct btrfs_root *root, struct btrfs_path *path,
3337                       u32 data_size)
3338 {
3339         int slot;
3340         struct extent_buffer *leaf;
3341         struct btrfs_item *item;
3342         u32 nritems;
3343         unsigned int data_end;
3344         unsigned int old_data;
3345         unsigned int old_size;
3346         int i;
3347
3348         leaf = path->nodes[0];
3349
3350         nritems = btrfs_header_nritems(leaf);
3351         data_end = leaf_data_end(root, leaf);
3352
3353         if (btrfs_leaf_free_space(root, leaf) < data_size) {
3354                 btrfs_print_leaf(root, leaf);
3355                 BUG();
3356         }
3357         slot = path->slots[0];
3358         old_data = btrfs_item_end_nr(leaf, slot);
3359
3360         BUG_ON(slot < 0);
3361         if (slot >= nritems) {
3362                 btrfs_print_leaf(root, leaf);
3363                 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3364                        slot, nritems);
3365                 BUG_ON(1);
3366         }
3367
3368         /*
3369          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3370          */
3371         /* first correct the data pointers */
3372         for (i = slot; i < nritems; i++) {
3373                 u32 ioff;
3374                 item = btrfs_item_nr(leaf, i);
3375
3376                 if (!leaf->map_token) {
3377                         map_extent_buffer(leaf, (unsigned long)item,
3378                                         sizeof(struct btrfs_item),
3379                                         &leaf->map_token, &leaf->kaddr,
3380                                         &leaf->map_start, &leaf->map_len,
3381                                         KM_USER1);
3382                 }
3383                 ioff = btrfs_item_offset(leaf, item);
3384                 btrfs_set_item_offset(leaf, item, ioff - data_size);
3385         }
3386
3387         if (leaf->map_token) {
3388                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3389                 leaf->map_token = NULL;
3390         }
3391
3392         /* shift the data */
3393         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3394                       data_end - data_size, btrfs_leaf_data(leaf) +
3395                       data_end, old_data - data_end);
3396
3397         data_end = old_data;
3398         old_size = btrfs_item_size_nr(leaf, slot);
3399         item = btrfs_item_nr(leaf, slot);
3400         btrfs_set_item_size(leaf, item, old_size + data_size);
3401         btrfs_mark_buffer_dirty(leaf);
3402
3403         if (btrfs_leaf_free_space(root, leaf) < 0) {
3404                 btrfs_print_leaf(root, leaf);
3405                 BUG();
3406         }
3407         return 0;
3408 }
3409
3410 /*
3411  * Given a key and some data, insert items into the tree.
3412  * This does all the path init required, making room in the tree if needed.
3413  * Returns the number of keys that were inserted.
3414  */
3415 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3416                             struct btrfs_root *root,
3417                             struct btrfs_path *path,
3418                             struct btrfs_key *cpu_key, u32 *data_size,
3419                             int nr)
3420 {
3421         struct extent_buffer *leaf;
3422         struct btrfs_item *item;
3423         int ret = 0;
3424         int slot;
3425         int i;
3426         u32 nritems;
3427         u32 total_data = 0;
3428         u32 total_size = 0;
3429         unsigned int data_end;
3430         struct btrfs_disk_key disk_key;
3431         struct btrfs_key found_key;
3432
3433         for (i = 0; i < nr; i++) {
3434                 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3435                     BTRFS_LEAF_DATA_SIZE(root)) {
3436                         break;
3437                         nr = i;
3438                 }
3439                 total_data += data_size[i];
3440                 total_size += data_size[i] + sizeof(struct btrfs_item);
3441         }
3442         BUG_ON(nr == 0);
3443
3444         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3445         if (ret == 0)
3446                 return -EEXIST;
3447         if (ret < 0)
3448                 goto out;
3449
3450         leaf = path->nodes[0];
3451
3452         nritems = btrfs_header_nritems(leaf);
3453         data_end = leaf_data_end(root, leaf);
3454
3455         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3456                 for (i = nr; i >= 0; i--) {
3457                         total_data -= data_size[i];
3458                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3459                         if (total_size < btrfs_leaf_free_space(root, leaf))
3460                                 break;
3461                 }
3462                 nr = i;
3463         }
3464
3465         slot = path->slots[0];
3466         BUG_ON(slot < 0);
3467
3468         if (slot != nritems) {
3469                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3470
3471                 item = btrfs_item_nr(leaf, slot);
3472                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3473
3474                 /* figure out how many keys we can insert in here */
3475                 total_data = data_size[0];
3476                 for (i = 1; i < nr; i++) {
3477                         if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3478                                 break;
3479                         total_data += data_size[i];
3480                 }
3481                 nr = i;
3482
3483                 if (old_data < data_end) {
3484                         btrfs_print_leaf(root, leaf);
3485                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3486                                slot, old_data, data_end);
3487                         BUG_ON(1);
3488                 }
3489                 /*
3490                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3491                  */
3492                 /* first correct the data pointers */
3493                 WARN_ON(leaf->map_token);
3494                 for (i = slot; i < nritems; i++) {
3495                         u32 ioff;
3496
3497                         item = btrfs_item_nr(leaf, i);
3498                         if (!leaf->map_token) {
3499                                 map_extent_buffer(leaf, (unsigned long)item,
3500                                         sizeof(struct btrfs_item),
3501                                         &leaf->map_token, &leaf->kaddr,
3502                                         &leaf->map_start, &leaf->map_len,
3503                                         KM_USER1);
3504                         }
3505
3506                         ioff = btrfs_item_offset(leaf, item);
3507                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3508                 }
3509                 if (leaf->map_token) {
3510                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3511                         leaf->map_token = NULL;
3512                 }
3513
3514                 /* shift the items */
3515                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3516                               btrfs_item_nr_offset(slot),
3517                               (nritems - slot) * sizeof(struct btrfs_item));
3518
3519                 /* shift the data */
3520                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3521                               data_end - total_data, btrfs_leaf_data(leaf) +
3522                               data_end, old_data - data_end);
3523                 data_end = old_data;
3524         } else {
3525                 /*
3526                  * this sucks but it has to be done, if we are inserting at
3527                  * the end of the leaf only insert 1 of the items, since we
3528                  * have no way of knowing whats on the next leaf and we'd have
3529                  * to drop our current locks to figure it out
3530                  */
3531                 nr = 1;
3532         }
3533
3534         /* setup the item for the new data */
3535         for (i = 0; i < nr; i++) {
3536                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3537                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3538                 item = btrfs_item_nr(leaf, slot + i);
3539                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3540                 data_end -= data_size[i];
3541                 btrfs_set_item_size(leaf, item, data_size[i]);
3542         }
3543         btrfs_set_header_nritems(leaf, nritems + nr);
3544         btrfs_mark_buffer_dirty(leaf);
3545
3546         ret = 0;
3547         if (slot == 0) {
3548                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3549                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3550         }
3551
3552         if (btrfs_leaf_free_space(root, leaf) < 0) {
3553                 btrfs_print_leaf(root, leaf);
3554                 BUG();
3555         }
3556 out:
3557         if (!ret)
3558                 ret = nr;
3559         return ret;
3560 }
3561
3562 /*
3563  * this is a helper for btrfs_insert_empty_items, the main goal here is
3564  * to save stack depth by doing the bulk of the work in a function
3565  * that doesn't call btrfs_search_slot
3566  */
3567 int setup_items_for_insert(struct btrfs_trans_handle *trans,
3568                            struct btrfs_root *root, struct btrfs_path *path,
3569                            struct btrfs_key *cpu_key, u32 *data_size,
3570                            u32 total_data, u32 total_size, int nr)
3571 {
3572         struct btrfs_item *item;
3573         int i;
3574         u32 nritems;
3575         unsigned int data_end;
3576         struct btrfs_disk_key disk_key;
3577         int ret;
3578         struct extent_buffer *leaf;
3579         int slot;
3580
3581         leaf = path->nodes[0];
3582         slot = path->slots[0];
3583
3584         nritems = btrfs_header_nritems(leaf);
3585         data_end = leaf_data_end(root, leaf);
3586
3587         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3588                 btrfs_print_leaf(root, leaf);
3589                 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3590                        total_size, btrfs_leaf_free_space(root, leaf));
3591                 BUG();
3592         }
3593
3594         if (slot != nritems) {
3595                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3596
3597                 if (old_data < data_end) {
3598                         btrfs_print_leaf(root, leaf);
3599                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3600                                slot, old_data, data_end);
3601                         BUG_ON(1);
3602                 }
3603                 /*
3604                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3605                  */
3606                 /* first correct the data pointers */
3607                 WARN_ON(leaf->map_token);
3608                 for (i = slot; i < nritems; i++) {
3609                         u32 ioff;
3610
3611                         item = btrfs_item_nr(leaf, i);
3612                         if (!leaf->map_token) {
3613                                 map_extent_buffer(leaf, (unsigned long)item,
3614                                         sizeof(struct btrfs_item),
3615                                         &leaf->map_token, &leaf->kaddr,
3616                                         &leaf->map_start, &leaf->map_len,
3617                                         KM_USER1);
3618                         }
3619
3620                         ioff = btrfs_item_offset(leaf, item);
3621                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3622                 }
3623                 if (leaf->map_token) {
3624                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3625                         leaf->map_token = NULL;
3626                 }
3627
3628                 /* shift the items */
3629                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3630                               btrfs_item_nr_offset(slot),
3631                               (nritems - slot) * sizeof(struct btrfs_item));
3632
3633                 /* shift the data */
3634                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3635                               data_end - total_data, btrfs_leaf_data(leaf) +
3636                               data_end, old_data - data_end);
3637                 data_end = old_data;
3638         }
3639
3640         /* setup the item for the new data */
3641         for (i = 0; i < nr; i++) {
3642                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3643                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3644                 item = btrfs_item_nr(leaf, slot + i);
3645                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3646                 data_end -= data_size[i];
3647                 btrfs_set_item_size(leaf, item, data_size[i]);
3648         }
3649
3650         btrfs_set_header_nritems(leaf, nritems + nr);
3651
3652         ret = 0;
3653         if (slot == 0) {
3654                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3655                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3656         }
3657         btrfs_unlock_up_safe(path, 1);
3658         btrfs_mark_buffer_dirty(leaf);
3659
3660         if (btrfs_leaf_free_space(root, leaf) < 0) {
3661                 btrfs_print_leaf(root, leaf);
3662                 BUG();
3663         }
3664         return ret;
3665 }
3666
3667 /*
3668  * Given a key and some data, insert items into the tree.
3669  * This does all the path init required, making room in the tree if needed.
3670  */
3671 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3672                             struct btrfs_root *root,
3673                             struct btrfs_path *path,
3674                             struct btrfs_key *cpu_key, u32 *data_size,
3675                             int nr)
3676 {
3677         int ret = 0;
3678         int slot;
3679         int i;
3680         u32 total_size = 0;
3681         u32 total_data = 0;
3682
3683         for (i = 0; i < nr; i++)
3684                 total_data += data_size[i];
3685
3686         total_size = total_data + (nr * sizeof(struct btrfs_item));
3687         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3688         if (ret == 0)
3689                 return -EEXIST;
3690         if (ret < 0)
3691                 goto out;
3692
3693         slot = path->slots[0];
3694         BUG_ON(slot < 0);
3695
3696         ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3697                                total_data, total_size, nr);
3698
3699 out:
3700         return ret;
3701 }
3702
3703 /*
3704  * Given a key and some data, insert an item into the tree.
3705  * This does all the path init required, making room in the tree if needed.
3706  */
3707 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3708                       *root, struct btrfs_key *cpu_key, void *data, u32
3709                       data_size)
3710 {
3711         int ret = 0;
3712         struct btrfs_path *path;
3713         struct extent_buffer *leaf;
3714         unsigned long ptr;
3715
3716         path = btrfs_alloc_path();
3717         if (!path)
3718                 return -ENOMEM;
3719         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3720         if (!ret) {
3721                 leaf = path->nodes[0];
3722                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3723                 write_extent_buffer(leaf, data, ptr, data_size);
3724                 btrfs_mark_buffer_dirty(leaf);
3725         }
3726         btrfs_free_path(path);
3727         return ret;
3728 }
3729
3730 /*
3731  * delete the pointer from a given node.
3732  *
3733  * the tree should have been previously balanced so the deletion does not
3734  * empty a node.
3735  */
3736 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3737                    struct btrfs_path *path, int level, int slot)
3738 {
3739         struct extent_buffer *parent = path->nodes[level];
3740         u32 nritems;
3741         int ret = 0;
3742         int wret;
3743
3744         nritems = btrfs_header_nritems(parent);
3745         if (slot != nritems - 1) {
3746                 memmove_extent_buffer(parent,
3747                               btrfs_node_key_ptr_offset(slot),
3748                               btrfs_node_key_ptr_offset(slot + 1),
3749                               sizeof(struct btrfs_key_ptr) *
3750                               (nritems - slot - 1));
3751         }
3752         nritems--;
3753         btrfs_set_header_nritems(parent, nritems);
3754         if (nritems == 0 && parent == root->node) {
3755                 BUG_ON(btrfs_header_level(root->node) != 1);
3756                 /* just turn the root into a leaf and break */
3757                 btrfs_set_header_level(root->node, 0);
3758         } else if (slot == 0) {
3759                 struct btrfs_disk_key disk_key;
3760
3761                 btrfs_node_key(parent, &disk_key, 0);
3762                 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3763                 if (wret)
3764                         ret = wret;
3765         }
3766         btrfs_mark_buffer_dirty(parent);
3767         return ret;
3768 }
3769
3770 /*
3771  * a helper function to delete the leaf pointed to by path->slots[1] and
3772  * path->nodes[1].
3773  *
3774  * This deletes the pointer in path->nodes[1] and frees the leaf
3775  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3776  *
3777  * The path must have already been setup for deleting the leaf, including
3778  * all the proper balancing.  path->nodes[1] must be locked.
3779  */
3780 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781                                    struct btrfs_root *root,
3782                                    struct btrfs_path *path,
3783                                    struct extent_buffer *leaf)
3784 {
3785         int ret;
3786
3787         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3788         ret = del_ptr(trans, root, path, 1, path->slots[1]);
3789         if (ret)
3790                 return ret;
3791
3792         /*
3793          * btrfs_free_extent is expensive, we want to make sure we
3794          * aren't holding any locks when we call it
3795          */
3796         btrfs_unlock_up_safe(path, 0);
3797
3798         root_sub_used(root, leaf->len);
3799
3800         btrfs_free_tree_block(trans, root, leaf, 0, 1);
3801         return 0;
3802 }
3803 /*
3804  * delete the item at the leaf level in path.  If that empties
3805  * the leaf, remove it from the tree
3806  */
3807 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3808                     struct btrfs_path *path, int slot, int nr)
3809 {
3810         struct extent_buffer *leaf;
3811         struct btrfs_item *item;
3812         int last_off;
3813         int dsize = 0;
3814         int ret = 0;
3815         int wret;
3816         int i;
3817         u32 nritems;
3818
3819         leaf = path->nodes[0];
3820         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3821
3822         for (i = 0; i < nr; i++)
3823                 dsize += btrfs_item_size_nr(leaf, slot + i);
3824
3825         nritems = btrfs_header_nritems(leaf);
3826
3827         if (slot + nr != nritems) {
3828                 int data_end = leaf_data_end(root, leaf);
3829
3830                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3831                               data_end + dsize,
3832                               btrfs_leaf_data(leaf) + data_end,
3833                               last_off - data_end);
3834
3835                 for (i = slot + nr; i < nritems; i++) {
3836                         u32 ioff;
3837
3838                         item = btrfs_item_nr(leaf, i);
3839                         if (!leaf->map_token) {
3840                                 map_extent_buffer(leaf, (unsigned long)item,
3841                                         sizeof(struct btrfs_item),
3842                                         &leaf->map_token, &leaf->kaddr,
3843                                         &leaf->map_start, &leaf->map_len,
3844                                         KM_USER1);
3845                         }
3846                         ioff = btrfs_item_offset(leaf, item);
3847                         btrfs_set_item_offset(leaf, item, ioff + dsize);
3848                 }
3849
3850                 if (leaf->map_token) {
3851                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3852                         leaf->map_token = NULL;
3853                 }
3854
3855                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3856                               btrfs_item_nr_offset(slot + nr),
3857                               sizeof(struct btrfs_item) *
3858                               (nritems - slot - nr));
3859         }
3860         btrfs_set_header_nritems(leaf, nritems - nr);
3861         nritems -= nr;
3862
3863         /* delete the leaf if we've emptied it */
3864         if (nritems == 0) {
3865                 if (leaf == root->node) {
3866                         btrfs_set_header_level(leaf, 0);
3867                 } else {
3868                         btrfs_set_path_blocking(path);
3869                         clean_tree_block(trans, root, leaf);
3870                         ret = btrfs_del_leaf(trans, root, path, leaf);
3871                         BUG_ON(ret);
3872                 }
3873         } else {
3874                 int used = leaf_space_used(leaf, 0, nritems);
3875                 if (slot == 0) {
3876                         struct btrfs_disk_key disk_key;
3877
3878                         btrfs_item_key(leaf, &disk_key, 0);
3879                         wret = fixup_low_keys(trans, root, path,
3880                                               &disk_key, 1);
3881                         if (wret)
3882                                 ret = wret;
3883                 }
3884
3885                 /* delete the leaf if it is mostly empty */
3886                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3887                         /* push_leaf_left fixes the path.
3888                          * make sure the path still points to our leaf
3889                          * for possible call to del_ptr below
3890                          */
3891                         slot = path->slots[1];
3892                         extent_buffer_get(leaf);
3893
3894                         btrfs_set_path_blocking(path);
3895                         wret = push_leaf_left(trans, root, path, 1, 1,
3896                                               1, (u32)-1);
3897                         if (wret < 0 && wret != -ENOSPC)
3898                                 ret = wret;
3899
3900                         if (path->nodes[0] == leaf &&
3901                             btrfs_header_nritems(leaf)) {
3902                                 wret = push_leaf_right(trans, root, path, 1,
3903                                                        1, 1, 0);
3904                                 if (wret < 0 && wret != -ENOSPC)
3905                                         ret = wret;
3906                         }
3907
3908                         if (btrfs_header_nritems(leaf) == 0) {
3909                                 path->slots[1] = slot;
3910                                 ret = btrfs_del_leaf(trans, root, path, leaf);
3911                                 BUG_ON(ret);
3912                                 free_extent_buffer(leaf);
3913                         } else {
3914                                 /* if we're still in the path, make sure
3915                                  * we're dirty.  Otherwise, one of the
3916                                  * push_leaf functions must have already
3917                                  * dirtied this buffer
3918                                  */
3919                                 if (path->nodes[0] == leaf)
3920                                         btrfs_mark_buffer_dirty(leaf);
3921                                 free_extent_buffer(leaf);
3922                         }
3923                 } else {
3924                         btrfs_mark_buffer_dirty(leaf);
3925                 }
3926         }
3927         return ret;
3928 }
3929
3930 /*
3931  * search the tree again to find a leaf with lesser keys
3932  * returns 0 if it found something or 1 if there are no lesser leaves.
3933  * returns < 0 on io errors.
3934  *
3935  * This may release the path, and so you may lose any locks held at the
3936  * time you call it.
3937  */
3938 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3939 {
3940         struct btrfs_key key;
3941         struct btrfs_disk_key found_key;
3942         int ret;
3943
3944         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3945
3946         if (key.offset > 0)
3947                 key.offset--;
3948         else if (key.type > 0)
3949                 key.type--;
3950         else if (key.objectid > 0)
3951                 key.objectid--;
3952         else
3953                 return 1;
3954
3955         btrfs_release_path(path);
3956         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3957         if (ret < 0)
3958                 return ret;
3959         btrfs_item_key(path->nodes[0], &found_key, 0);
3960         ret = comp_keys(&found_key, &key);
3961         if (ret < 0)
3962                 return 0;
3963         return 1;
3964 }
3965
3966 /*
3967  * A helper function to walk down the tree starting at min_key, and looking
3968  * for nodes or leaves that are either in cache or have a minimum
3969  * transaction id.  This is used by the btree defrag code, and tree logging
3970  *
3971  * This does not cow, but it does stuff the starting key it finds back
3972  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3973  * key and get a writable path.
3974  *
3975  * This does lock as it descends, and path->keep_locks should be set
3976  * to 1 by the caller.
3977  *
3978  * This honors path->lowest_level to prevent descent past a given level
3979  * of the tree.
3980  *
3981  * min_trans indicates the oldest transaction that you are interested
3982  * in walking through.  Any nodes or leaves older than min_trans are
3983  * skipped over (without reading them).
3984  *
3985  * returns zero if something useful was found, < 0 on error and 1 if there
3986  * was nothing in the tree that matched the search criteria.
3987  */
3988 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3989                          struct btrfs_key *max_key,
3990                          struct btrfs_path *path, int cache_only,
3991                          u64 min_trans)
3992 {
3993         struct extent_buffer *cur;
3994         struct btrfs_key found_key;
3995         int slot;
3996         int sret;
3997         u32 nritems;
3998         int level;
3999         int ret = 1;
4000
4001         WARN_ON(!path->keep_locks);
4002 again:
4003         cur = btrfs_lock_root_node(root);
4004         level = btrfs_header_level(cur);
4005         WARN_ON(path->nodes[level]);
4006         path->nodes[level] = cur;
4007         path->locks[level] = 1;
4008
4009         if (btrfs_header_generation(cur) < min_trans) {
4010                 ret = 1;
4011                 goto out;
4012         }
4013         while (1) {
4014                 nritems = btrfs_header_nritems(cur);
4015                 level = btrfs_header_level(cur);
4016                 sret = bin_search(cur, min_key, level, &slot);
4017
4018                 /* at the lowest level, we're done, setup the path and exit */
4019                 if (level == path->lowest_level) {
4020                         if (slot >= nritems)
4021                                 goto find_next_key;
4022                         ret = 0;
4023                         path->slots[level] = slot;
4024                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4025                         goto out;
4026                 }
4027                 if (sret && slot > 0)
4028                         slot--;
4029                 /*
4030                  * check this node pointer against the cache_only and
4031                  * min_trans parameters.  If it isn't in cache or is too
4032                  * old, skip to the next one.
4033                  */
4034                 while (slot < nritems) {
4035                         u64 blockptr;
4036                         u64 gen;
4037                         struct extent_buffer *tmp;
4038                         struct btrfs_disk_key disk_key;
4039
4040                         blockptr = btrfs_node_blockptr(cur, slot);
4041                         gen = btrfs_node_ptr_generation(cur, slot);
4042                         if (gen < min_trans) {
4043                                 slot++;
4044                                 continue;
4045                         }
4046                         if (!cache_only)
4047                                 break;
4048
4049                         if (max_key) {
4050                                 btrfs_node_key(cur, &disk_key, slot);
4051                                 if (comp_keys(&disk_key, max_key) >= 0) {
4052                                         ret = 1;
4053                                         goto out;
4054                                 }
4055                         }
4056
4057                         tmp = btrfs_find_tree_block(root, blockptr,
4058                                             btrfs_level_size(root, level - 1));
4059
4060                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
4061                                 free_extent_buffer(tmp);
4062                                 break;
4063                         }
4064                         if (tmp)
4065                                 free_extent_buffer(tmp);
4066                         slot++;
4067                 }
4068 find_next_key:
4069                 /*
4070                  * we didn't find a candidate key in this node, walk forward
4071                  * and find another one
4072                  */
4073                 if (slot >= nritems) {
4074                         path->slots[level] = slot;
4075                         btrfs_set_path_blocking(path);
4076                         sret = btrfs_find_next_key(root, path, min_key, level,
4077                                                   cache_only, min_trans);
4078                         if (sret == 0) {
4079                                 btrfs_release_path(path);
4080                                 goto again;
4081                         } else {
4082                                 goto out;
4083                         }
4084                 }
4085                 /* save our key for returning back */
4086                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4087                 path->slots[level] = slot;
4088                 if (level == path->lowest_level) {
4089                         ret = 0;
4090                         unlock_up(path, level, 1);
4091                         goto out;
4092                 }
4093                 btrfs_set_path_blocking(path);
4094                 cur = read_node_slot(root, cur, slot);
4095                 BUG_ON(!cur);
4096
4097                 btrfs_tree_lock(cur);
4098
4099                 path->locks[level - 1] = 1;
4100                 path->nodes[level - 1] = cur;
4101                 unlock_up(path, level, 1);
4102                 btrfs_clear_path_blocking(path, NULL);
4103         }
4104 out:
4105         if (ret == 0)
4106                 memcpy(min_key, &found_key, sizeof(found_key));
4107         btrfs_set_path_blocking(path);
4108         return ret;
4109 }
4110
4111 /*
4112  * this is similar to btrfs_next_leaf, but does not try to preserve
4113  * and fixup the path.  It looks for and returns the next key in the
4114  * tree based on the current path and the cache_only and min_trans
4115  * parameters.
4116  *
4117  * 0 is returned if another key is found, < 0 if there are any errors
4118  * and 1 is returned if there are no higher keys in the tree
4119  *
4120  * path->keep_locks should be set to 1 on the search made before
4121  * calling this function.
4122  */
4123 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4124                         struct btrfs_key *key, int level,
4125                         int cache_only, u64 min_trans)
4126 {
4127         int slot;
4128         struct extent_buffer *c;
4129
4130         WARN_ON(!path->keep_locks);
4131         while (level < BTRFS_MAX_LEVEL) {
4132                 if (!path->nodes[level])
4133                         return 1;
4134
4135                 slot = path->slots[level] + 1;
4136                 c = path->nodes[level];
4137 next:
4138                 if (slot >= btrfs_header_nritems(c)) {
4139                         int ret;
4140                         int orig_lowest;
4141                         struct btrfs_key cur_key;
4142                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4143                             !path->nodes[level + 1])
4144                                 return 1;
4145
4146                         if (path->locks[level + 1]) {
4147                                 level++;
4148                                 continue;
4149                         }
4150
4151                         slot = btrfs_header_nritems(c) - 1;
4152                         if (level == 0)
4153                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4154                         else
4155                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4156
4157                         orig_lowest = path->lowest_level;
4158                         btrfs_release_path(path);
4159                         path->lowest_level = level;
4160                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4161                                                 0, 0);
4162                         path->lowest_level = orig_lowest;
4163                         if (ret < 0)
4164                                 return ret;
4165
4166                         c = path->nodes[level];
4167                         slot = path->slots[level];
4168                         if (ret == 0)
4169                                 slot++;
4170                         goto next;
4171                 }
4172
4173                 if (level == 0)
4174                         btrfs_item_key_to_cpu(c, key, slot);
4175                 else {
4176                         u64 blockptr = btrfs_node_blockptr(c, slot);
4177                         u64 gen = btrfs_node_ptr_generation(c, slot);
4178
4179                         if (cache_only) {
4180                                 struct extent_buffer *cur;
4181                                 cur = btrfs_find_tree_block(root, blockptr,
4182                                             btrfs_level_size(root, level - 1));
4183                                 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4184                                         slot++;
4185                                         if (cur)
4186                                                 free_extent_buffer(cur);
4187                                         goto next;
4188                                 }
4189                                 free_extent_buffer(cur);
4190                         }
4191                         if (gen < min_trans) {
4192                                 slot++;
4193                                 goto next;
4194                         }
4195                         btrfs_node_key_to_cpu(c, key, slot);
4196                 }
4197                 return 0;
4198         }
4199         return 1;
4200 }
4201
4202 /*
4203  * search the tree again to find a leaf with greater keys
4204  * returns 0 if it found something or 1 if there are no greater leaves.
4205  * returns < 0 on io errors.
4206  */
4207 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4208 {
4209         int slot;
4210         int level;
4211         struct extent_buffer *c;
4212         struct extent_buffer *next;
4213         struct btrfs_key key;
4214         u32 nritems;
4215         int ret;
4216         int old_spinning = path->leave_spinning;
4217         int force_blocking = 0;
4218
4219         nritems = btrfs_header_nritems(path->nodes[0]);
4220         if (nritems == 0)
4221                 return 1;
4222
4223         /*
4224          * we take the blocks in an order that upsets lockdep.  Using
4225          * blocking mode is the only way around it.
4226          */
4227 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4228         force_blocking = 1;
4229 #endif
4230
4231         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4232 again:
4233         level = 1;
4234         next = NULL;
4235         btrfs_release_path(path);
4236
4237         path->keep_locks = 1;
4238
4239         if (!force_blocking)
4240                 path->leave_spinning = 1;
4241
4242         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4243         path->keep_locks = 0;
4244
4245         if (ret < 0)
4246                 return ret;
4247
4248         nritems = btrfs_header_nritems(path->nodes[0]);
4249         /*
4250          * by releasing the path above we dropped all our locks.  A balance
4251          * could have added more items next to the key that used to be
4252          * at the very end of the block.  So, check again here and
4253          * advance the path if there are now more items available.
4254          */
4255         if (nritems > 0 && path->slots[0] < nritems - 1) {
4256                 if (ret == 0)
4257                         path->slots[0]++;
4258                 ret = 0;
4259                 goto done;
4260         }
4261
4262         while (level < BTRFS_MAX_LEVEL) {
4263                 if (!path->nodes[level]) {
4264                         ret = 1;
4265                         goto done;
4266                 }
4267
4268                 slot = path->slots[level] + 1;
4269                 c = path->nodes[level];
4270                 if (slot >= btrfs_header_nritems(c)) {
4271                         level++;
4272                         if (level == BTRFS_MAX_LEVEL) {
4273                                 ret = 1;
4274                                 goto done;
4275                         }
4276                         continue;
4277                 }
4278
4279                 if (next) {
4280                         btrfs_tree_unlock(next);
4281                         free_extent_buffer(next);
4282                 }
4283
4284                 next = c;
4285                 ret = read_block_for_search(NULL, root, path, &next, level,
4286                                             slot, &key);
4287                 if (ret == -EAGAIN)
4288                         goto again;
4289
4290                 if (ret < 0) {
4291                         btrfs_release_path(path);
4292                         goto done;
4293                 }
4294
4295                 if (!path->skip_locking) {
4296                         ret = btrfs_try_spin_lock(next);
4297                         if (!ret) {
4298                                 btrfs_set_path_blocking(path);
4299                                 btrfs_tree_lock(next);
4300                                 if (!force_blocking)
4301                                         btrfs_clear_path_blocking(path, next);
4302                         }
4303                         if (force_blocking)
4304                                 btrfs_set_lock_blocking(next);
4305                 }
4306                 break;
4307         }
4308         path->slots[level] = slot;
4309         while (1) {
4310                 level--;
4311                 c = path->nodes[level];
4312                 if (path->locks[level])
4313                         btrfs_tree_unlock(c);
4314
4315                 free_extent_buffer(c);
4316                 path->nodes[level] = next;
4317                 path->slots[level] = 0;
4318                 if (!path->skip_locking)
4319                         path->locks[level] = 1;
4320
4321                 if (!level)
4322                         break;
4323
4324                 ret = read_block_for_search(NULL, root, path, &next, level,
4325                                             0, &key);
4326                 if (ret == -EAGAIN)
4327                         goto again;
4328
4329                 if (ret < 0) {
4330                         btrfs_release_path(path);
4331                         goto done;
4332                 }
4333
4334                 if (!path->skip_locking) {
4335                         btrfs_assert_tree_locked(path->nodes[level]);
4336                         ret = btrfs_try_spin_lock(next);
4337                         if (!ret) {
4338                                 btrfs_set_path_blocking(path);
4339                                 btrfs_tree_lock(next);
4340                                 if (!force_blocking)
4341                                         btrfs_clear_path_blocking(path, next);
4342                         }
4343                         if (force_blocking)
4344                                 btrfs_set_lock_blocking(next);
4345                 }
4346         }
4347         ret = 0;
4348 done:
4349         unlock_up(path, 0, 1);
4350         path->leave_spinning = old_spinning;
4351         if (!old_spinning)
4352                 btrfs_set_path_blocking(path);
4353
4354         return ret;
4355 }
4356
4357 /*
4358  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4359  * searching until it gets past min_objectid or finds an item of 'type'
4360  *
4361  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4362  */
4363 int btrfs_previous_item(struct btrfs_root *root,
4364                         struct btrfs_path *path, u64 min_objectid,
4365                         int type)
4366 {
4367         struct btrfs_key found_key;
4368         struct extent_buffer *leaf;
4369         u32 nritems;
4370         int ret;
4371
4372         while (1) {
4373                 if (path->slots[0] == 0) {
4374                         btrfs_set_path_blocking(path);
4375                         ret = btrfs_prev_leaf(root, path);
4376                         if (ret != 0)
4377                                 return ret;
4378                 } else {
4379                         path->slots[0]--;
4380                 }
4381                 leaf = path->nodes[0];
4382                 nritems = btrfs_header_nritems(leaf);
4383                 if (nritems == 0)
4384                         return 1;
4385                 if (path->slots[0] == nritems)
4386                         path->slots[0]--;
4387
4388                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4389                 if (found_key.objectid < min_objectid)
4390                         break;
4391                 if (found_key.type == type)
4392                         return 0;
4393                 if (found_key.objectid == min_objectid &&
4394                     found_key.type < type)
4395                         break;
4396         }
4397         return 1;
4398 }