Merge branch 'urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/rric/oprofile...
[pandora-kernel.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "locking.h"
26
27 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
28                       *root, struct btrfs_path *path, int level);
29 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_key *ins_key,
31                       struct btrfs_path *path, int data_size, int extend);
32 static int push_node_left(struct btrfs_trans_handle *trans,
33                           struct btrfs_root *root, struct extent_buffer *dst,
34                           struct extent_buffer *src, int empty);
35 static int balance_node_right(struct btrfs_trans_handle *trans,
36                               struct btrfs_root *root,
37                               struct extent_buffer *dst_buf,
38                               struct extent_buffer *src_buf);
39 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40                    struct btrfs_path *path, int level, int slot);
41
42 struct btrfs_path *btrfs_alloc_path(void)
43 {
44         struct btrfs_path *path;
45         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
46         if (path)
47                 path->reada = 1;
48         return path;
49 }
50
51 /*
52  * set all locked nodes in the path to blocking locks.  This should
53  * be done before scheduling
54  */
55 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
56 {
57         int i;
58         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
59                 if (p->nodes[i] && p->locks[i])
60                         btrfs_set_lock_blocking(p->nodes[i]);
61         }
62 }
63
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	/* walk from the highest level down so the spinlocks are retaken
	 * in the same top-down order lockdep expects */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
}
99
100 /* this also releases the path */
101 void btrfs_free_path(struct btrfs_path *p)
102 {
103         if (!p)
104                 return;
105         btrfs_release_path(p);
106         kmem_cache_free(btrfs_path_cachep, p);
107 }
108
109 /*
110  * path release drops references on the extent buffers in the path
111  * and it drops any locks held by this path
112  *
113  * It is safe to call this on paths that no locks or extent buffers held.
114  */
115 noinline void btrfs_release_path(struct btrfs_path *p)
116 {
117         int i;
118
119         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
120                 p->slots[i] = 0;
121                 if (!p->nodes[i])
122                         continue;
123                 if (p->locks[i]) {
124                         btrfs_tree_unlock(p->nodes[i]);
125                         p->locks[i] = 0;
126                 }
127                 free_extent_buffer(p->nodes[i]);
128                 p->nodes[i] = NULL;
129         }
130 }
131
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	/* the RCU read section keeps root->node alive long enough for us
	 * to take our own reference on it */
	rcu_read_lock();
	eb = rcu_dereference(root->node);
	extent_buffer_get(eb);
	rcu_read_unlock();
	return eb;
}
152
153 /* loop around taking references on and locking the root node of the
154  * tree until you end up with a lock on the root.  A locked buffer
155  * is returned, with a reference held.
156  */
157 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
158 {
159         struct extent_buffer *eb;
160
161         while (1) {
162                 eb = btrfs_root_node(root);
163                 btrfs_tree_lock(eb);
164                 if (eb == root->node)
165                         break;
166                 btrfs_tree_unlock(eb);
167                 free_extent_buffer(eb);
168         }
169         return eb;
170 }
171
172 /* cowonly root (everything not a reference counted cow subvolume), just get
173  * put onto a simple dirty list.  transaction.c walks this to make sure they
174  * get properly updated on disk.
175  */
176 static void add_root_to_dirty_list(struct btrfs_root *root)
177 {
178         if (root->track_dirty && list_empty(&root->dirty_list)) {
179                 list_add(&root->dirty_list,
180                          &root->fs_info->dirty_cowonly_roots);
181         }
182 }
183
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted trees may only be modified inside the running
	 * transaction they were joined to */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	/* the first key of the block serves as the allocation hint key */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the contents, then rewrite the header fields that must
	 * describe the copy rather than the original */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* relocation trees use full backrefs (last argument == 1) */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
243
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* v0 extent-tree blocks predate the mixed backref format and are
	 * conservatively treated as shareable */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
269
/*
 * Adjust the extent backrefs when a block is about to be replaced by
 * its COW copy.  Sets *last_ref to 1 when the original block's last
 * reference is being dropped so the caller can free it.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* look up the live refcount and flags for the block */
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		BUG_ON(ret);
		BUG_ON(refs == 0);
	} else {
		/* unshareable blocks have exactly one reference */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* reloc-owned blocks must already carry the full backref flag */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* block is shared: the original stays alive, so convert it
		 * to full backrefs (if needed) and add refs for the copy */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret);

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret);
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret);
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret);
		}
		if (new_flags != 0) {
			/* persist the full-backref conversion on disk */
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			BUG_ON(ret);
		}
	} else {
		/* last reference: move the refs over to the copy and let
		 * the caller free the original block */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret);
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret);
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
364
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* if the caller passed buf itself as *cow_ret, it expects us to
	 * unlock the original once the copy replaces it */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block is used as the allocator hint key */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	/* only the reloc tree passes the parent's bytenr as a hint */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	/* NOTE(review): return value ignored; update_ref_for_cow BUG()s
	 * internally on failure, so it currently always returns 0 */
	update_ref_for_cow(trans, root, buf, cow, &last_ref);

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		/* replacing the root: publish the copy under RCU and put
		 * the old root block on the free path */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* interior block: rewrite the parent's pointer/generation
		 * for this slot to reference the copy */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
480
481 static inline int should_cow_block(struct btrfs_trans_handle *trans,
482                                    struct btrfs_root *root,
483                                    struct extent_buffer *buf)
484 {
485         if (btrfs_header_generation(buf) == trans->transid &&
486             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
487             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
488               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
489                 return 0;
490         return 1;
491 }
492
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	/* sanity: the handle must belong to the running transaction ... */
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	/* ... and match the filesystem generation */
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
		WARN_ON(1);
	}

	/* already cowed in this transaction; reuse the block in place */
	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	/* hint the allocator to stay within the same 1GB region */
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
538
539 /*
540  * helper function for defrag to decide if two blocks pointed to by a
541  * node are actually close by
542  */
543 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
544 {
545         if (blocknr < other && other - (blocknr + blocksize) < 32768)
546                 return 1;
547         if (blocknr > other && blocknr - (other + blocksize) < 32768)
548                 return 1;
549         return 0;
550 }
551
552 /*
553  * compare two keys in a memcmp fashion
554  */
555 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
556 {
557         struct btrfs_key k1;
558
559         btrfs_disk_key_to_cpu(&k1, disk);
560
561         return btrfs_comp_cpu_keys(&k1, k2);
562 }
563
564 /*
565  * same as comp_keys only with two btrfs_key's
566  */
567 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
568 {
569         if (k1->objectid > k2->objectid)
570                 return 1;
571         if (k1->objectid < k2->objectid)
572                 return -1;
573         if (k1->type > k2->type)
574                 return 1;
575         if (k1->type < k2->type)
576                 return -1;
577         if (k1->offset > k2->offset)
578                 return 1;
579         if (k1->offset < k2->offset)
580                 return -1;
581         return 0;
582 }
583
584 /*
585  * this is used by the defrag code to go through all the
586  * leaves pointed to by a node and reallocate them so that
587  * disk order is close to key order
588  */
589 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
590                        struct btrfs_root *root, struct extent_buffer *parent,
591                        int start_slot, int cache_only, u64 *last_ret,
592                        struct btrfs_key *progress)
593 {
594         struct extent_buffer *cur;
595         u64 blocknr;
596         u64 gen;
597         u64 search_start = *last_ret;
598         u64 last_block = 0;
599         u64 other;
600         u32 parent_nritems;
601         int end_slot;
602         int i;
603         int err = 0;
604         int parent_level;
605         int uptodate;
606         u32 blocksize;
607         int progress_passed = 0;
608         struct btrfs_disk_key disk_key;
609
610         parent_level = btrfs_header_level(parent);
611         if (cache_only && parent_level != 1)
612                 return 0;
613
614         if (trans->transaction != root->fs_info->running_transaction)
615                 WARN_ON(1);
616         if (trans->transid != root->fs_info->generation)
617                 WARN_ON(1);
618
619         parent_nritems = btrfs_header_nritems(parent);
620         blocksize = btrfs_level_size(root, parent_level - 1);
621         end_slot = parent_nritems;
622
623         if (parent_nritems == 1)
624                 return 0;
625
626         btrfs_set_lock_blocking(parent);
627
628         for (i = start_slot; i < end_slot; i++) {
629                 int close = 1;
630
631                 if (!parent->map_token) {
632                         map_extent_buffer(parent,
633                                         btrfs_node_key_ptr_offset(i),
634                                         sizeof(struct btrfs_key_ptr),
635                                         &parent->map_token, &parent->kaddr,
636                                         &parent->map_start, &parent->map_len,
637                                         KM_USER1);
638                 }
639                 btrfs_node_key(parent, &disk_key, i);
640                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
641                         continue;
642
643                 progress_passed = 1;
644                 blocknr = btrfs_node_blockptr(parent, i);
645                 gen = btrfs_node_ptr_generation(parent, i);
646                 if (last_block == 0)
647                         last_block = blocknr;
648
649                 if (i > 0) {
650                         other = btrfs_node_blockptr(parent, i - 1);
651                         close = close_blocks(blocknr, other, blocksize);
652                 }
653                 if (!close && i < end_slot - 2) {
654                         other = btrfs_node_blockptr(parent, i + 1);
655                         close = close_blocks(blocknr, other, blocksize);
656                 }
657                 if (close) {
658                         last_block = blocknr;
659                         continue;
660                 }
661                 if (parent->map_token) {
662                         unmap_extent_buffer(parent, parent->map_token,
663                                             KM_USER1);
664                         parent->map_token = NULL;
665                 }
666
667                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
668                 if (cur)
669                         uptodate = btrfs_buffer_uptodate(cur, gen);
670                 else
671                         uptodate = 0;
672                 if (!cur || !uptodate) {
673                         if (cache_only) {
674                                 free_extent_buffer(cur);
675                                 continue;
676                         }
677                         if (!cur) {
678                                 cur = read_tree_block(root, blocknr,
679                                                          blocksize, gen);
680                                 if (!cur)
681                                         return -EIO;
682                         } else if (!uptodate) {
683                                 btrfs_read_buffer(cur, gen);
684                         }
685                 }
686                 if (search_start == 0)
687                         search_start = last_block;
688
689                 btrfs_tree_lock(cur);
690                 btrfs_set_lock_blocking(cur);
691                 err = __btrfs_cow_block(trans, root, cur, parent, i,
692                                         &cur, search_start,
693                                         min(16 * blocksize,
694                                             (end_slot - i) * blocksize));
695                 if (err) {
696                         btrfs_tree_unlock(cur);
697                         free_extent_buffer(cur);
698                         break;
699                 }
700                 search_start = cur->start;
701                 last_block = cur->start;
702                 *last_ret = search_start;
703                 btrfs_tree_unlock(cur);
704                 free_extent_buffer(cur);
705         }
706         if (parent->map_token) {
707                 unmap_extent_buffer(parent, parent->map_token,
708                                     KM_USER1);
709                 parent->map_token = NULL;
710         }
711         return err;
712 }
713
714 /*
715  * The leaf data grows from end-to-front in the node.
716  * this returns the address of the start of the last item,
717  * which is the stop of the leaf data stack
718  */
719 static inline unsigned int leaf_data_end(struct btrfs_root *root,
720                                          struct extent_buffer *leaf)
721 {
722         u32 nr = btrfs_header_nritems(leaf);
723         if (nr == 0)
724                 return BTRFS_LEAF_DATA_SIZE(root);
725         return btrfs_item_offset_nr(leaf, nr - 1);
726 }
727
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * returns 0 when the key was found, 1 when it was not.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap only when the key lies outside the currently
		 * mapped window of the extent buffer */
		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
			if (map_token) {
				unmap_extent_buffer(eb, map_token, KM_USER0);
				map_token = NULL;
			}

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/* key straddles a page boundary; copy it out
				 * to an aligned local instead */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			/* exact match */
			*slot = mid;
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
			return 0;
		}
	}
	/* not found: low is the insertion point */
	*slot = low;
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
	return 1;
}
805
806 /*
807  * simple bin_search frontend that does the right thing for
808  * leaves vs nodes
809  */
810 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
811                       int level, int *slot)
812 {
813         if (level == 0) {
814                 return generic_bin_search(eb,
815                                           offsetof(struct btrfs_leaf, items),
816                                           sizeof(struct btrfs_item),
817                                           key, btrfs_header_nritems(eb),
818                                           slot);
819         } else {
820                 return generic_bin_search(eb,
821                                           offsetof(struct btrfs_node, ptrs),
822                                           sizeof(struct btrfs_key_ptr),
823                                           key, btrfs_header_nritems(eb),
824                                           slot);
825         }
826         return -1;
827 }
828
/*
 * Public wrapper around bin_search(): look up @key in @eb at @level.
 * Returns 0 on an exact match, 1 otherwise; *@slot receives the match
 * index or the insertion point.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
834
835 static void root_add_used(struct btrfs_root *root, u32 size)
836 {
837         spin_lock(&root->accounting_lock);
838         btrfs_set_root_used(&root->root_item,
839                             btrfs_root_used(&root->root_item) + size);
840         spin_unlock(&root->accounting_lock);
841 }
842
843 static void root_sub_used(struct btrfs_root *root, u32 size)
844 {
845         spin_lock(&root->accounting_lock);
846         btrfs_set_root_used(&root->root_item,
847                             btrfs_root_used(&root->root_item) - size);
848         spin_unlock(&root->accounting_lock);
849 }
850
851 /* given a node and slot number, this reads the blocks it points to.  The
852  * extent buffer is returned with a reference taken (but unlocked).
853  * NULL is returned on error.
854  */
855 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
856                                    struct extent_buffer *parent, int slot)
857 {
858         int level = btrfs_header_level(parent);
859         if (slot < 0)
860                 return NULL;
861         if (slot >= btrfs_header_nritems(parent))
862                 return NULL;
863
864         BUG_ON(level == 0);
865
866         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
867                        btrfs_level_size(root, level - 1),
868                        btrfs_node_ptr_generation(parent, slot));
869 }
870
871 /*
872  * node level balancing, used to make sure nodes are in proper order for
873  * item deletion.  We balance from the top down, so we have to make sure
874  * that a deletion won't leave an node completely empty later on.
875  */
876 static noinline int balance_level(struct btrfs_trans_handle *trans,
877                          struct btrfs_root *root,
878                          struct btrfs_path *path, int level)
879 {
880         struct extent_buffer *right = NULL;
881         struct extent_buffer *mid;
882         struct extent_buffer *left = NULL;
883         struct extent_buffer *parent = NULL;
884         int ret = 0;
885         int wret;
886         int pslot;
887         int orig_slot = path->slots[level];
888         u64 orig_ptr;
889
890         if (level == 0)
891                 return 0;
892
893         mid = path->nodes[level];
894
895         WARN_ON(!path->locks[level]);
896         WARN_ON(btrfs_header_generation(mid) != trans->transid);
897
898         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
899
900         if (level < BTRFS_MAX_LEVEL - 1)
901                 parent = path->nodes[level + 1];
902         pslot = path->slots[level + 1];
903
904         /*
905          * deal with the case where there is only one pointer in the root
906          * by promoting the node below to a root
907          */
908         if (!parent) {
909                 struct extent_buffer *child;
910
911                 if (btrfs_header_nritems(mid) != 1)
912                         return 0;
913
914                 /* promote the child to a root */
915                 child = read_node_slot(root, mid, 0);
916                 BUG_ON(!child);
917                 btrfs_tree_lock(child);
918                 btrfs_set_lock_blocking(child);
919                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
920                 if (ret) {
921                         btrfs_tree_unlock(child);
922                         free_extent_buffer(child);
923                         goto enospc;
924                 }
925
926                 rcu_assign_pointer(root->node, child);
927
928                 add_root_to_dirty_list(root);
929                 btrfs_tree_unlock(child);
930
931                 path->locks[level] = 0;
932                 path->nodes[level] = NULL;
933                 clean_tree_block(trans, root, mid);
934                 btrfs_tree_unlock(mid);
935                 /* once for the path */
936                 free_extent_buffer(mid);
937
938                 root_sub_used(root, mid->len);
939                 btrfs_free_tree_block(trans, root, mid, 0, 1);
940                 /* once for the root ptr */
941                 free_extent_buffer(mid);
942                 return 0;
943         }
944         if (btrfs_header_nritems(mid) >
945             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
946                 return 0;
947
948         btrfs_header_nritems(mid);
949
950         left = read_node_slot(root, parent, pslot - 1);
951         if (left) {
952                 btrfs_tree_lock(left);
953                 btrfs_set_lock_blocking(left);
954                 wret = btrfs_cow_block(trans, root, left,
955                                        parent, pslot - 1, &left);
956                 if (wret) {
957                         ret = wret;
958                         goto enospc;
959                 }
960         }
961         right = read_node_slot(root, parent, pslot + 1);
962         if (right) {
963                 btrfs_tree_lock(right);
964                 btrfs_set_lock_blocking(right);
965                 wret = btrfs_cow_block(trans, root, right,
966                                        parent, pslot + 1, &right);
967                 if (wret) {
968                         ret = wret;
969                         goto enospc;
970                 }
971         }
972
973         /* first, try to make some room in the middle buffer */
974         if (left) {
975                 orig_slot += btrfs_header_nritems(left);
976                 wret = push_node_left(trans, root, left, mid, 1);
977                 if (wret < 0)
978                         ret = wret;
979                 btrfs_header_nritems(mid);
980         }
981
982         /*
983          * then try to empty the right most buffer into the middle
984          */
985         if (right) {
986                 wret = push_node_left(trans, root, mid, right, 1);
987                 if (wret < 0 && wret != -ENOSPC)
988                         ret = wret;
989                 if (btrfs_header_nritems(right) == 0) {
990                         clean_tree_block(trans, root, right);
991                         btrfs_tree_unlock(right);
992                         wret = del_ptr(trans, root, path, level + 1, pslot +
993                                        1);
994                         if (wret)
995                                 ret = wret;
996                         root_sub_used(root, right->len);
997                         btrfs_free_tree_block(trans, root, right, 0, 1);
998                         free_extent_buffer(right);
999                         right = NULL;
1000                 } else {
1001                         struct btrfs_disk_key right_key;
1002                         btrfs_node_key(right, &right_key, 0);
1003                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1004                         btrfs_mark_buffer_dirty(parent);
1005                 }
1006         }
1007         if (btrfs_header_nritems(mid) == 1) {
1008                 /*
1009                  * we're not allowed to leave a node with one item in the
1010                  * tree during a delete.  A deletion from lower in the tree
1011                  * could try to delete the only pointer in this node.
1012                  * So, pull some keys from the left.
1013                  * There has to be a left pointer at this point because
1014                  * otherwise we would have pulled some pointers from the
1015                  * right
1016                  */
1017                 BUG_ON(!left);
1018                 wret = balance_node_right(trans, root, mid, left);
1019                 if (wret < 0) {
1020                         ret = wret;
1021                         goto enospc;
1022                 }
1023                 if (wret == 1) {
1024                         wret = push_node_left(trans, root, left, mid, 1);
1025                         if (wret < 0)
1026                                 ret = wret;
1027                 }
1028                 BUG_ON(wret == 1);
1029         }
1030         if (btrfs_header_nritems(mid) == 0) {
1031                 clean_tree_block(trans, root, mid);
1032                 btrfs_tree_unlock(mid);
1033                 wret = del_ptr(trans, root, path, level + 1, pslot);
1034                 if (wret)
1035                         ret = wret;
1036                 root_sub_used(root, mid->len);
1037                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1038                 free_extent_buffer(mid);
1039                 mid = NULL;
1040         } else {
1041                 /* update the parent key to reflect our changes */
1042                 struct btrfs_disk_key mid_key;
1043                 btrfs_node_key(mid, &mid_key, 0);
1044                 btrfs_set_node_key(parent, &mid_key, pslot);
1045                 btrfs_mark_buffer_dirty(parent);
1046         }
1047
1048         /* update the path */
1049         if (left) {
1050                 if (btrfs_header_nritems(left) > orig_slot) {
1051                         extent_buffer_get(left);
1052                         /* left was locked after cow */
1053                         path->nodes[level] = left;
1054                         path->slots[level + 1] -= 1;
1055                         path->slots[level] = orig_slot;
1056                         if (mid) {
1057                                 btrfs_tree_unlock(mid);
1058                                 free_extent_buffer(mid);
1059                         }
1060                 } else {
1061                         orig_slot -= btrfs_header_nritems(left);
1062                         path->slots[level] = orig_slot;
1063                 }
1064         }
1065         /* double check we haven't messed things up */
1066         if (orig_ptr !=
1067             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1068                 BUG();
1069 enospc:
1070         if (right) {
1071                 btrfs_tree_unlock(right);
1072                 free_extent_buffer(right);
1073         }
1074         if (left) {
1075                 if (path->nodes[level] != left)
1076                         btrfs_tree_unlock(left);
1077                 free_extent_buffer(left);
1078         }
1079         return ret;
1080 }
1081
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if room was made by pushing into a sibling (the path may now
 * point into that sibling), or 1 if nothing could be pushed and the
 * caller must split the node instead.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* leaves are handled by leaf pushing, not here */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1)
		parent = path->nodes[level + 1];
	pslot = path->slots[level + 1];

	/* no parent means mid is the root: nothing to push into */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left sibling is (nearly) full too: can't push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* some ptrs moved left: fix parent key and the path */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot migrated into the left sibling */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* slot stays in mid; adjust for moved items */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right sibling is (nearly) full too: can't push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* some ptrs moved right: fix parent key and the path */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot migrated into the right sibling */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
1212
1213 /*
1214  * readahead one full node of leaves, finding things that are close
1215  * to the block in 'slot', and triggering ra on them.
1216  */
1217 static void reada_for_search(struct btrfs_root *root,
1218                              struct btrfs_path *path,
1219                              int level, int slot, u64 objectid)
1220 {
1221         struct extent_buffer *node;
1222         struct btrfs_disk_key disk_key;
1223         u32 nritems;
1224         u64 search;
1225         u64 target;
1226         u64 nread = 0;
1227         int direction = path->reada;
1228         struct extent_buffer *eb;
1229         u32 nr;
1230         u32 blocksize;
1231         u32 nscan = 0;
1232
1233         if (level != 1)
1234                 return;
1235
1236         if (!path->nodes[level])
1237                 return;
1238
1239         node = path->nodes[level];
1240
1241         search = btrfs_node_blockptr(node, slot);
1242         blocksize = btrfs_level_size(root, level - 1);
1243         eb = btrfs_find_tree_block(root, search, blocksize);
1244         if (eb) {
1245                 free_extent_buffer(eb);
1246                 return;
1247         }
1248
1249         target = search;
1250
1251         nritems = btrfs_header_nritems(node);
1252         nr = slot;
1253         while (1) {
1254                 if (direction < 0) {
1255                         if (nr == 0)
1256                                 break;
1257                         nr--;
1258                 } else if (direction > 0) {
1259                         nr++;
1260                         if (nr >= nritems)
1261                                 break;
1262                 }
1263                 if (path->reada < 0 && objectid) {
1264                         btrfs_node_key(node, &disk_key, nr);
1265                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1266                                 break;
1267                 }
1268                 search = btrfs_node_blockptr(node, nr);
1269                 if ((search <= target && target - search <= 65536) ||
1270                     (search > target && search - target <= 65536)) {
1271                         readahead_tree_block(root, search, blocksize,
1272                                      btrfs_node_ptr_generation(node, nr));
1273                         nread += blocksize;
1274                 }
1275                 nscan++;
1276                 if ((nread > 65536 || nscan > 32))
1277                         break;
1278         }
1279 }
1280
1281 /*
1282  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1283  * cache
1284  */
1285 static noinline int reada_for_balance(struct btrfs_root *root,
1286                                       struct btrfs_path *path, int level)
1287 {
1288         int slot;
1289         int nritems;
1290         struct extent_buffer *parent;
1291         struct extent_buffer *eb;
1292         u64 gen;
1293         u64 block1 = 0;
1294         u64 block2 = 0;
1295         int ret = 0;
1296         int blocksize;
1297
1298         parent = path->nodes[level + 1];
1299         if (!parent)
1300                 return 0;
1301
1302         nritems = btrfs_header_nritems(parent);
1303         slot = path->slots[level + 1];
1304         blocksize = btrfs_level_size(root, level);
1305
1306         if (slot > 0) {
1307                 block1 = btrfs_node_blockptr(parent, slot - 1);
1308                 gen = btrfs_node_ptr_generation(parent, slot - 1);
1309                 eb = btrfs_find_tree_block(root, block1, blocksize);
1310                 if (eb && btrfs_buffer_uptodate(eb, gen))
1311                         block1 = 0;
1312                 free_extent_buffer(eb);
1313         }
1314         if (slot + 1 < nritems) {
1315                 block2 = btrfs_node_blockptr(parent, slot + 1);
1316                 gen = btrfs_node_ptr_generation(parent, slot + 1);
1317                 eb = btrfs_find_tree_block(root, block2, blocksize);
1318                 if (eb && btrfs_buffer_uptodate(eb, gen))
1319                         block2 = 0;
1320                 free_extent_buffer(eb);
1321         }
1322         if (block1 || block2) {
1323                 ret = -EAGAIN;
1324
1325                 /* release the whole path */
1326                 btrfs_release_path(path);
1327
1328                 /* read the blocks */
1329                 if (block1)
1330                         readahead_tree_block(root, block1, blocksize, 0);
1331                 if (block2)
1332                         readahead_tree_block(root, block2, blocksize, 0);
1333
1334                 if (block1) {
1335                         eb = read_tree_block(root, block1, blocksize, 0);
1336                         free_extent_buffer(eb);
1337                 }
1338                 if (block2) {
1339                         eb = read_tree_block(root, block2, blocksize, 0);
1340                         free_extent_buffer(eb);
1341                 }
1342         }
1343         return ret;
1344 }
1345
1346
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
{
	int i;
	/* levels at or below skip_level must stay locked */
	int skip_level = level;
	/* once set, stop raising skip_level for higher slots */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			/* slot 0: a key update here may propagate upward */
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				/* last slot: keep the lock for next-slot walks */
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}
1396
1397 /*
1398  * This releases any locks held in the path starting at level and
1399  * going all the way up to the root.
1400  *
1401  * btrfs_search_slot will keep the lock held on higher nodes in a few
1402  * corner cases, such as COW of the block at slot zero in the node.  This
1403  * ignores those rules, and it should only be called when there are no
1404  * more updates to be done higher up in the tree.
1405  */
1406 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1407 {
1408         int i;
1409
1410         if (path->keep_locks)
1411                 return;
1412
1413         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1414                 if (!path->nodes[i])
1415                         continue;
1416                 if (!path->locks[i])
1417                         continue;
1418                 btrfs_tree_unlock(path->nodes[i]);
1419                 path->locks[i] = 0;
1420         }
1421 }
1422
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 *
 * Returns 0 (with *eb_ret set to a referenced buffer), -EAGAIN after
 * dropping the path, or -EIO when the block cannot be read correctly.
 */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		if (btrfs_buffer_uptodate(tmp, 0)) {
			if (btrfs_buffer_uptodate(tmp, gen)) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				*eb_ret = tmp;
				return 0;
			}
			/* still the wrong generation: give up with EIO */
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	/* drop the stale (not-uptodate) buffer ref, if we held one */
	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
1509
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len)
{
	int ret;
	/* inserting and the node is nearly full: split it */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* prefetch siblings; nonzero means the path was dropped */
		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting and the node is under half full: rebalance */
		int sret;

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			/* balancing removed this level entirely: retry */
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
1573
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
 *
 * if cow is set, every block visited on the way down is COWed first.
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	/*
	 * deletions may need to fix up the key in the parent via
	 * fixup_low_keys, so keep one extra level locked above the leaf
	 */
	if (ins_len < 0)
		lowest_unlock = 2;

again:
	if (p->search_commit_root) {
		/* search the last committed copy of the tree instead */
		b = root->commit_root;
		extent_buffer_get(b);
		if (!p->skip_locking)
			btrfs_tree_lock(b);
	} else {
		if (p->skip_locking)
			b = btrfs_root_node(root);
		else
			b = btrfs_lock_root_node(root);
	}

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);
		if (level != btrfs_header_level(b))
			WARN_ON(1);
		level = btrfs_header_level(b);

		/* the COW may have replaced b, redo the path setup for it */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		btrfs_clear_path_blocking(p, NULL);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			if (ret && slot > 0) {
				/*
				 * no exact match: follow the ptr whose key
				 * is the last one <= the search key
				 */
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
						     ins_len);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			unlock_up(p, level, lowest_unlock);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				/* try the cheap spinning lock before
				 * falling back to a blocking lock
				 */
				btrfs_clear_path_blocking(p, NULL);
				err = btrfs_try_spin_lock(b);

				if (!err) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
					btrfs_clear_path_blocking(p, b);
				}
			}
		} else {
			/* level 0: we've hit the leaf */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);
	return ret;
}
1754
1755 /*
1756  * adjust the pointers going up the tree, starting at level
1757  * making sure the right key of each node is points to 'key'.
1758  * This is used after shifting pointers to the left, so it stops
1759  * fixing up pointers when a given leaf/node is not in slot 0 of the
1760  * higher levels
1761  *
1762  * If this fails to write a tree block, it returns -1, but continues
1763  * fixing up the blocks in ram so the tree is consistent.
1764  */
1765 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1766                           struct btrfs_root *root, struct btrfs_path *path,
1767                           struct btrfs_disk_key *key, int level)
1768 {
1769         int i;
1770         int ret = 0;
1771         struct extent_buffer *t;
1772
1773         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1774                 int tslot = path->slots[i];
1775                 if (!path->nodes[i])
1776                         break;
1777                 t = path->nodes[i];
1778                 btrfs_set_node_key(t, key, tslot);
1779                 btrfs_mark_buffer_dirty(path->nodes[i]);
1780                 if (tslot != 0)
1781                         break;
1782         }
1783         return ret;
1784 }
1785
1786 /*
1787  * update item key.
1788  *
1789  * This function isn't completely safe. It's the caller's responsibility
1790  * that the new key won't break the order
1791  */
1792 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1793                             struct btrfs_root *root, struct btrfs_path *path,
1794                             struct btrfs_key *new_key)
1795 {
1796         struct btrfs_disk_key disk_key;
1797         struct extent_buffer *eb;
1798         int slot;
1799
1800         eb = path->nodes[0];
1801         slot = path->slots[0];
1802         if (slot > 0) {
1803                 btrfs_item_key(eb, &disk_key, slot - 1);
1804                 if (comp_keys(&disk_key, new_key) >= 0)
1805                         return -1;
1806         }
1807         if (slot < btrfs_header_nritems(eb) - 1) {
1808                 btrfs_item_key(eb, &disk_key, slot + 1);
1809                 if (comp_keys(&disk_key, new_key) <= 0)
1810                         return -1;
1811         }
1812
1813         btrfs_cpu_key_to_disk(&disk_key, new_key);
1814         btrfs_set_item_key(eb, &disk_key, slot);
1815         btrfs_mark_buffer_dirty(eb);
1816         if (slot == 0)
1817                 fixup_low_keys(trans, root, path, &disk_key, 1);
1818         return 0;
1819 }
1820
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free ptr slots available in the destination */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both blocks must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* append the first push_items ptrs of src to the end of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/* slide the remaining src ptrs down to slot 0 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
1882
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both blocks must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free ptr slots available in the destination */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing ptrs up to open a hole at slot 0 */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* move the tail of src into the hole at the front of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
1941
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new root block */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* point the new root's only slot at the old root block */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* swap in the new root; RCU readers may still see the old one */
	old = root->node;
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* extra ref for the path's reference to the new root */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = 1;
	path->slots[level] = 0;
	return 0;
}
2014
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 *
 * returns zero on success and < 0 on any error
 */
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
		      *key, u64 bytenr, int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* the caller is responsible for making room first */
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
		BUG();
	if (slot != nritems) {
		/* shift later ptrs up by one to open the target slot */
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
	return 0;
}
2052
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	int wret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/* trying to split the root, lets make a new one */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try pushing into the neighbors before splitting */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new right-hand node */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);


	/* move the upper half of the ptrs into the new node */
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after c */
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
			  path->slots[level + 1] + 1,
			  level + 1);
	if (wret)
		ret = wret;

	/* fix the path to point at whichever half now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
2146
2147 /*
2148  * how many bytes are required to store the items in a leaf.  start
2149  * and nr indicate which items in the leaf to check.  This totals up the
2150  * space used both by the item structs and the item data
2151  */
2152 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2153 {
2154         int data_len;
2155         int nritems = btrfs_header_nritems(l);
2156         int end = min(nritems, start + nr) - 1;
2157
2158         if (!nr)
2159                 return 0;
2160         data_len = btrfs_item_end_nr(l, start);
2161         data_len = data_len - btrfs_item_offset_nr(l, end);
2162         data_len += sizeof(struct btrfs_item) * nr;
2163         WARN_ON(data_len < 0);
2164         return data_len;
2165 }
2166
2167 /*
2168  * The space between the end of the leaf items and
2169  * the start of the leaf data.  IOW, how much room
2170  * the leaf has left for both items and data
2171  */
2172 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2173                                    struct extent_buffer *leaf)
2174 {
2175         int nritems = btrfs_header_nritems(leaf);
2176         int ret;
2177         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2178         if (ret < 0) {
2179                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2180                        "used %d nritems %d\n",
2181                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2182                        leaf_space_used(leaf, 0, nritems), nritems);
2183         }
2184         return ret;
2185 }
2186
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* reserve room for the pending insert if it lands past the end */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* walk back from the tail of the left leaf, counting what fits */
	while (i >= nr) {
		item = btrfs_item_nr(left, i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* the slot we're inserting into also needs data_size bytes */
		if (path->slots[0] == i)
			push_space += data_size;

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}

	if (push_items == 0)
		goto out_unlock;

	if (!empty && push_items == left_nritems)
		WARN_ON(1);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* bytes of item data actually being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers up to open slots at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}
	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		/* the left leaf is now empty; release it for reuse */
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right has a new first key; fix the parent's ptr to match */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
2350
2351 /*
2352  * push some data in the path leaf to the right, trying to free up at
2353  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2354  *
2355  * returns 1 if the push failed because the other node didn't have enough
2356  * room, 0 if everything worked out and < 0 if there were major errors.
2357  *
2358  * this will push starting from min_slot to the end of the leaf.  It won't
2359  * push any slot lower than min_slot
2360  */
2361 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2362                            *root, struct btrfs_path *path,
2363                            int min_data_size, int data_size,
2364                            int empty, u32 min_slot)
2365 {
2366         struct extent_buffer *left = path->nodes[0];
2367         struct extent_buffer *right;
2368         struct extent_buffer *upper;
2369         int slot;
2370         int free_space;
2371         u32 left_nritems;
2372         int ret;
2373
2374         if (!path->nodes[1])
2375                 return 1;
2376
2377         slot = path->slots[1];
2378         upper = path->nodes[1];
2379         if (slot >= btrfs_header_nritems(upper) - 1)
2380                 return 1;
2381
2382         btrfs_assert_tree_locked(path->nodes[1]);
2383
2384         right = read_node_slot(root, upper, slot + 1);
2385         if (right == NULL)
2386                 return 1;
2387
2388         btrfs_tree_lock(right);
2389         btrfs_set_lock_blocking(right);
2390
2391         free_space = btrfs_leaf_free_space(root, right);
2392         if (free_space < data_size)
2393                 goto out_unlock;
2394
2395         /* cow and double check */
2396         ret = btrfs_cow_block(trans, root, right, upper,
2397                               slot + 1, &right);
2398         if (ret)
2399                 goto out_unlock;
2400
2401         free_space = btrfs_leaf_free_space(root, right);
2402         if (free_space < data_size)
2403                 goto out_unlock;
2404
2405         left_nritems = btrfs_header_nritems(left);
2406         if (left_nritems == 0)
2407                 goto out_unlock;
2408
2409         return __push_leaf_right(trans, root, path, min_data_size, empty,
2410                                 right, free_space, left_nritems, min_slot);
2411 out_unlock:
2412         btrfs_tree_unlock(right);
2413         free_extent_buffer(right);
2414         return 1;
2415 }
2416
2417 /*
2418  * push some data in the path leaf to the left, trying to free up at
2419  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2420  *
2421  * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2423  * items
2424  */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	int wret;
	u32 this_item_size;
	u32 old_left_item_size;

	/* unless emptying the leaf, always leave at least one item behind */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/*
	 * Pass 1: count how many leading items (and how many data bytes)
	 * fit into 'free_space' in the left leaf.
	 */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);
		/* cache the kmap of the item area across iterations */
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		if (!empty && push_items > 0) {
			/* never push the slot the caller is working on away */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				/* stop early if this leaf keeps enough room */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room for the caller's pending insertion */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	if (!empty && push_items == btrfs_header_nritems(right))
		WARN_ON(1);

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* total data bytes being moved (data grows down from the leaf end) */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rewrite the data offsets of the items that just landed in 'left' */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(left, i);
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}

	/* fixup right node */
	if (push_items > right_nritems) {
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		/* slide the remaining data back to the end of the leaf */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		/* and slide the remaining item headers to slot 0 */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every data offset in the shrunken right leaf */
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(trans, root, right);

	/* the right leaf's first key changed; fix the keys above it */
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
	if (wret)
		ret = wret;

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot moved into the left leaf; follow it */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
2605
2606 /*
2607  * push some data in the path leaf to the left, trying to free up at
2608  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2609  *
2610  * max_slot can put a limit on how far into the leaf we'll push items.  The
2611  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2612  * items
2613  */
2614 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2615                           *root, struct btrfs_path *path, int min_data_size,
2616                           int data_size, int empty, u32 max_slot)
2617 {
2618         struct extent_buffer *right = path->nodes[0];
2619         struct extent_buffer *left;
2620         int slot;
2621         int free_space;
2622         u32 right_nritems;
2623         int ret = 0;
2624
2625         slot = path->slots[1];
2626         if (slot == 0)
2627                 return 1;
2628         if (!path->nodes[1])
2629                 return 1;
2630
2631         right_nritems = btrfs_header_nritems(right);
2632         if (right_nritems == 0)
2633                 return 1;
2634
2635         btrfs_assert_tree_locked(path->nodes[1]);
2636
2637         left = read_node_slot(root, path->nodes[1], slot - 1);
2638         if (left == NULL)
2639                 return 1;
2640
2641         btrfs_tree_lock(left);
2642         btrfs_set_lock_blocking(left);
2643
2644         free_space = btrfs_leaf_free_space(root, left);
2645         if (free_space < data_size) {
2646                 ret = 1;
2647                 goto out;
2648         }
2649
2650         /* cow and double check */
2651         ret = btrfs_cow_block(trans, root, left,
2652                               path->nodes[1], slot - 1, &left);
2653         if (ret) {
2654                 /* we hit -ENOSPC, but it isn't fatal here */
2655                 ret = 1;
2656                 goto out;
2657         }
2658
2659         free_space = btrfs_leaf_free_space(root, left);
2660         if (free_space < data_size) {
2661                 ret = 1;
2662                 goto out;
2663         }
2664
2665         return __push_leaf_left(trans, root, path, min_data_size,
2666                                empty, left, free_space, right_nritems,
2667                                max_slot);
2668 out:
2669         btrfs_tree_unlock(left);
2670         free_extent_buffer(left);
2671         return ret;
2672 }
2673
2674 /*
2675  * split the path's leaf in two, making sure there is at least data_size
2676  * available for the resulting leaf level of the path.
2677  *
2678  * returns 0 if all went well and < 0 on failure.
2679  */
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct extent_buffer *l,
			       struct extent_buffer *right,
			       int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	int ret = 0;
	int wret;
	struct btrfs_disk_key disk_key;

	/* items [mid, nritems) of 'l' move into the new leaf 'right' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers to the front of the new leaf */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* distance each data offset shifts now that earlier items are gone */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
		u32 ioff;

		/* cache the kmap of the item area; unmapped below */
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	/* the original leaf keeps only its first 'mid' items */
	btrfs_set_header_nritems(l, mid);
	ret = 0;
	/* link the new leaf into the parent, right after the original */
	btrfs_item_key(right, &disk_key, 0);
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
	if (wret)
		ret = wret;

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now contains our slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);

	return ret;
}
2758
2759 /*
2760  * double splits happen when we need to insert a big item in the middle
2761  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
2762  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2763  *          A                 B                 C
2764  *
2765  * We avoid this by trying to push the items on either side of our target
2766  * into the adjacent leaves.  If all goes well we can avoid the double split
2767  * completely.
2768  */
2769 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2770                                           struct btrfs_root *root,
2771                                           struct btrfs_path *path,
2772                                           int data_size)
2773 {
2774         int ret;
2775         int progress = 0;
2776         int slot;
2777         u32 nritems;
2778
2779         slot = path->slots[0];
2780
2781         /*
2782          * try to push all the items after our slot into the
2783          * right leaf
2784          */
2785         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2786         if (ret < 0)
2787                 return ret;
2788
2789         if (ret == 0)
2790                 progress++;
2791
2792         nritems = btrfs_header_nritems(path->nodes[0]);
2793         /*
2794          * our goal is to get our slot at the start or end of a leaf.  If
2795          * we've done so we're done
2796          */
2797         if (path->slots[0] == 0 || path->slots[0] == nritems)
2798                 return 0;
2799
2800         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2801                 return 0;
2802
2803         /* try to push all the items before our slot into the next leaf */
2804         slot = path->slots[0];
2805         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2806         if (ret < 0)
2807                 return ret;
2808
2809         if (ret == 0)
2810                 progress++;
2811
2812         if (progress)
2813                 return 0;
2814         return 1;
2815 }
2816
2817 /*
2818  * split the path's leaf in two, making sure there is at least data_size
2819  * available for the resulting leaf level of the path.
2820  *
2821  * returns 0 if all went well and < 0 on failure.
2822  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	int ret = 0;
	int wret;
	/*
	 * split mode:
	 *   0 - no items move; just allocate a new empty leaf for ins_key
	 *   1 - ordinary split at 'mid'
	 *   2 - double split: split once, then split again on retry
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed a whole leaf */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size) {
		wret = push_leaf_right(trans, root, path, data_size,
				       data_size, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			wret = push_leaf_left(trans, root, path, data_size,
					      data_size, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/* pick the split point so the half holding 'slot' gains room */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				/* appending past the end: new empty leaf */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					/* try pushing before a double split */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				/* prepending: new empty leaf on the left */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					/* try pushing before a double split */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2 ;
				}
			}
		}
	}

	/* key of the new leaf: the insert key itself, or the item at mid */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
					root->root_key.objectid,
					&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->leafsize);

	/* initialize the header of the freshly allocated leaf */
	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		/* no items move: just link the empty leaf into the parent */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			wret = insert_ptr(trans, root, path,
					  &disk_key, right->start,
					  path->slots[1] + 1, 1);
			if (wret)
				ret = wret;

			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			wret = insert_ptr(trans, root, path,
					  &disk_key,
					  right->start,
					  path->slots[1], 1);
			if (wret)
				ret = wret;
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			/* inserted at slot 0: propagate the new low key */
			if (path->slots[1] == 0) {
				wret = fixup_low_keys(trans, root,
						path, &disk_key, 1);
				if (wret)
					ret = wret;
			}
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
	BUG_ON(ret);

	/* a double split runs the whole procedure once more, at most once */
	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return ret;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
2997
2998 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
2999                                          struct btrfs_root *root,
3000                                          struct btrfs_path *path, int ins_len)
3001 {
3002         struct btrfs_key key;
3003         struct extent_buffer *leaf;
3004         struct btrfs_file_extent_item *fi;
3005         u64 extent_len = 0;
3006         u32 item_size;
3007         int ret;
3008
3009         leaf = path->nodes[0];
3010         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3011
3012         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3013                key.type != BTRFS_EXTENT_CSUM_KEY);
3014
3015         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3016                 return 0;
3017
3018         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3019         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3020                 fi = btrfs_item_ptr(leaf, path->slots[0],
3021                                     struct btrfs_file_extent_item);
3022                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3023         }
3024         btrfs_release_path(path);
3025
3026         path->keep_locks = 1;
3027         path->search_for_split = 1;
3028         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3029         path->search_for_split = 0;
3030         if (ret < 0)
3031                 goto err;
3032
3033         ret = -EAGAIN;
3034         leaf = path->nodes[0];
3035         /* if our item isn't there or got smaller, return now */
3036         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3037                 goto err;
3038
3039         /* the leaf has  changed, it now has room.  return now */
3040         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3041                 goto err;
3042
3043         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044                 fi = btrfs_item_ptr(leaf, path->slots[0],
3045                                     struct btrfs_file_extent_item);
3046                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3047                         goto err;
3048         }
3049
3050         btrfs_set_path_blocking(path);
3051         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3052         if (ret)
3053                 goto err;
3054
3055         path->keep_locks = 0;
3056         btrfs_unlock_up_safe(path, 1);
3057         return 0;
3058 err:
3059         path->keep_locks = 0;
3060         return ret;
3061 }
3062
/*
 * Split the item at path->slots[0] in place: the first split_offset
 * bytes keep the original key, the remainder becomes a new item with
 * 'new_key' at the next slot.  The leaf must already have room for one
 * extra struct btrfs_item header (the data itself is not duplicated).
 *
 * Returns 0 on success or -ENOMEM.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	/* we may block on the allocation below; drop spinning locks */
	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data before we rearrange the leaf */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes right after the one being split */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	/* new item covers the tail [split_offset, item_size) of the data */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* original item shrinks to the head [0, split_offset) */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
3133
3134 /*
3135  * This function splits a single item into two items,
3136  * giving 'new_key' to the new item and splitting the
3137  * old one at split_offset (from the start of the item).
3138  *
3139  * The path may be released by this operation.  After
3140  * the split, the path is pointing to the old item.  The
3141  * new item is going to be in the same node as the old one.
3142  *
3143  * Note, the item being split must be smaller enough to live alone on
3144  * a tree block with room for one extra struct btrfs_item
3145  *
3146  * This allows us to split the item in place, keeping a lock on the
3147  * leaf the entire time.
3148  */
3149 int btrfs_split_item(struct btrfs_trans_handle *trans,
3150                      struct btrfs_root *root,
3151                      struct btrfs_path *path,
3152                      struct btrfs_key *new_key,
3153                      unsigned long split_offset)
3154 {
3155         int ret;
3156         ret = setup_leaf_for_split(trans, root, path,
3157                                    sizeof(struct btrfs_item));
3158         if (ret)
3159                 return ret;
3160
3161         ret = split_item(trans, root, path, new_key, split_offset);
3162         return ret;
3163 }
3164
3165 /*
3166  * This function duplicate a item, giving 'new_key' to the new item.
3167  * It guarantees both items live in the same tree leaf and the new item
3168  * is contiguous with the original item.
3169  *
3170  * This allows us to split file extent in place, keeping a lock on the
3171  * leaf the entire time.
3172  */
3173 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3174                          struct btrfs_root *root,
3175                          struct btrfs_path *path,
3176                          struct btrfs_key *new_key)
3177 {
3178         struct extent_buffer *leaf;
3179         int ret;
3180         u32 item_size;
3181
3182         leaf = path->nodes[0];
3183         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3184         ret = setup_leaf_for_split(trans, root, path,
3185                                    item_size + sizeof(struct btrfs_item));
3186         if (ret)
3187                 return ret;
3188
3189         path->slots[0]++;
3190         ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
3191                                      item_size, item_size +
3192                                      sizeof(struct btrfs_item), 1);
3193         BUG_ON(ret);
3194
3195         leaf = path->nodes[0];
3196         memcpy_extent_buffer(leaf,
3197                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3198                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3199                              item_size);
3200         return 0;
3201 }
3202
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 *
 * When chopping from the front, the item's key offset is raised by the
 * number of bytes removed so the key still refers to the first byte of
 * remaining data.  Returns 0.
 */
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct btrfs_path *path,
                        u32 new_size, int from_end)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data_start;
        unsigned int old_size;
        unsigned int size_diff;
        int i;

        leaf = path->nodes[0];
        slot = path->slots[0];

        old_size = btrfs_item_size_nr(leaf, slot);
        if (old_size == new_size)
                return 0;

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        old_data_start = btrfs_item_offset_nr(leaf, slot);

        /* number of data bytes being removed from the item */
        size_diff = old_size - new_size;

        BUG_ON(slot < 0);
        BUG_ON(slot >= nritems);

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(leaf, i);

                /* keep a cached mapping of the item array across the loop */
                if (!leaf->map_token) {
                        map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                }

                ioff = btrfs_item_offset(leaf, item);
                btrfs_set_item_offset(leaf, item, ioff + size_diff);
        }

        if (leaf->map_token) {
                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                leaf->map_token = NULL;
        }

        /* shift the data */
        if (from_end) {
                /* keep the front of the item; slide it over the dropped tail */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start + new_size - data_end);
        } else {
                struct btrfs_disk_key disk_key;
                u64 offset;

                btrfs_item_key(leaf, &disk_key, slot);

                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
                        unsigned long ptr;
                        struct btrfs_file_extent_item *fi;

                        /*
                         * inline file extents keep their header: move the
                         * part of the header preceding disk_bytenr up to
                         * the item's (soon-to-be-shifted) start
                         */
                        fi = btrfs_item_ptr(leaf, slot,
                                            struct btrfs_file_extent_item);
                        fi = (struct btrfs_file_extent_item *)(
                             (unsigned long)fi - size_diff);

                        if (btrfs_file_extent_type(leaf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
                                      offsetof(struct btrfs_file_extent_item,
                                                 disk_bytenr));
                        }
                }

                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start - data_end);

                /* the data start moved forward; update the key to match */
                offset = btrfs_disk_key_offset(&disk_key);
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                if (slot == 0)
                        fixup_low_keys(trans, root, path, &disk_key, 1);
        }

        item = btrfs_item_nr(leaf, slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        return 0;
}
3317
/*
 * make the item pointed to by the path bigger, data_size is the number
 * of bytes of data area to add.  The new space appears at the end of
 * the item's data; the caller must make room in the leaf first and
 * fill in the new bytes afterwards.  Returns 0.
 */
int btrfs_extend_item(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      u32 data_size)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data;
        unsigned int old_size;
        int i;

        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        /* the caller is responsible for having made room already */
        if (btrfs_leaf_free_space(root, leaf) < data_size) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        slot = path->slots[0];
        old_data = btrfs_item_end_nr(leaf, slot);

        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(root, leaf);
                printk(KERN_CRIT "slot %d too large, nritems %d\n",
                       slot, nritems);
                BUG_ON(1);
        }

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(leaf, i);

                /* keep a cached mapping of the item array across the loop */
                if (!leaf->map_token) {
                        map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                }
                /* data grows toward the front of the leaf: offsets shrink */
                ioff = btrfs_item_offset(leaf, item);
                btrfs_set_item_offset(leaf, item, ioff - data_size);
        }

        if (leaf->map_token) {
                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                leaf->map_token = NULL;
        }

        /* shift the data */
        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                      data_end - data_size, btrfs_leaf_data(leaf) +
                      data_end, old_data - data_end);

        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
        item = btrfs_item_nr(leaf, slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        return 0;
}
3395
3396 /*
3397  * Given a key and some data, insert items into the tree.
3398  * This does all the path init required, making room in the tree if needed.
3399  * Returns the number of keys that were inserted.
3400  */
3401 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3402                             struct btrfs_root *root,
3403                             struct btrfs_path *path,
3404                             struct btrfs_key *cpu_key, u32 *data_size,
3405                             int nr)
3406 {
3407         struct extent_buffer *leaf;
3408         struct btrfs_item *item;
3409         int ret = 0;
3410         int slot;
3411         int i;
3412         u32 nritems;
3413         u32 total_data = 0;
3414         u32 total_size = 0;
3415         unsigned int data_end;
3416         struct btrfs_disk_key disk_key;
3417         struct btrfs_key found_key;
3418
3419         for (i = 0; i < nr; i++) {
3420                 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3421                     BTRFS_LEAF_DATA_SIZE(root)) {
3422                         break;
3423                         nr = i;
3424                 }
3425                 total_data += data_size[i];
3426                 total_size += data_size[i] + sizeof(struct btrfs_item);
3427         }
3428         BUG_ON(nr == 0);
3429
3430         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3431         if (ret == 0)
3432                 return -EEXIST;
3433         if (ret < 0)
3434                 goto out;
3435
3436         leaf = path->nodes[0];
3437
3438         nritems = btrfs_header_nritems(leaf);
3439         data_end = leaf_data_end(root, leaf);
3440
3441         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3442                 for (i = nr; i >= 0; i--) {
3443                         total_data -= data_size[i];
3444                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3445                         if (total_size < btrfs_leaf_free_space(root, leaf))
3446                                 break;
3447                 }
3448                 nr = i;
3449         }
3450
3451         slot = path->slots[0];
3452         BUG_ON(slot < 0);
3453
3454         if (slot != nritems) {
3455                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3456
3457                 item = btrfs_item_nr(leaf, slot);
3458                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3459
3460                 /* figure out how many keys we can insert in here */
3461                 total_data = data_size[0];
3462                 for (i = 1; i < nr; i++) {
3463                         if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3464                                 break;
3465                         total_data += data_size[i];
3466                 }
3467                 nr = i;
3468
3469                 if (old_data < data_end) {
3470                         btrfs_print_leaf(root, leaf);
3471                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3472                                slot, old_data, data_end);
3473                         BUG_ON(1);
3474                 }
3475                 /*
3476                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3477                  */
3478                 /* first correct the data pointers */
3479                 WARN_ON(leaf->map_token);
3480                 for (i = slot; i < nritems; i++) {
3481                         u32 ioff;
3482
3483                         item = btrfs_item_nr(leaf, i);
3484                         if (!leaf->map_token) {
3485                                 map_extent_buffer(leaf, (unsigned long)item,
3486                                         sizeof(struct btrfs_item),
3487                                         &leaf->map_token, &leaf->kaddr,
3488                                         &leaf->map_start, &leaf->map_len,
3489                                         KM_USER1);
3490                         }
3491
3492                         ioff = btrfs_item_offset(leaf, item);
3493                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3494                 }
3495                 if (leaf->map_token) {
3496                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3497                         leaf->map_token = NULL;
3498                 }
3499
3500                 /* shift the items */
3501                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3502                               btrfs_item_nr_offset(slot),
3503                               (nritems - slot) * sizeof(struct btrfs_item));
3504
3505                 /* shift the data */
3506                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3507                               data_end - total_data, btrfs_leaf_data(leaf) +
3508                               data_end, old_data - data_end);
3509                 data_end = old_data;
3510         } else {
3511                 /*
3512                  * this sucks but it has to be done, if we are inserting at
3513                  * the end of the leaf only insert 1 of the items, since we
3514                  * have no way of knowing whats on the next leaf and we'd have
3515                  * to drop our current locks to figure it out
3516                  */
3517                 nr = 1;
3518         }
3519
3520         /* setup the item for the new data */
3521         for (i = 0; i < nr; i++) {
3522                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3523                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3524                 item = btrfs_item_nr(leaf, slot + i);
3525                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3526                 data_end -= data_size[i];
3527                 btrfs_set_item_size(leaf, item, data_size[i]);
3528         }
3529         btrfs_set_header_nritems(leaf, nritems + nr);
3530         btrfs_mark_buffer_dirty(leaf);
3531
3532         ret = 0;
3533         if (slot == 0) {
3534                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3535                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3536         }
3537
3538         if (btrfs_leaf_free_space(root, leaf) < 0) {
3539                 btrfs_print_leaf(root, leaf);
3540                 BUG();
3541         }
3542 out:
3543         if (!ret)
3544                 ret = nr;
3545         return ret;
3546 }
3547
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * The path must already point at the slot where the items belong.
 * data_size[] holds the nr item data sizes, total_data is their sum and
 * total_size is total_data plus nr item headers.  Returns 0, or the
 * result of fixup_low_keys() when inserting at slot 0.
 */
int setup_items_for_insert(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, struct btrfs_path *path,
                           struct btrfs_key *cpu_key, u32 *data_size,
                           u32 total_data, u32 total_size, int nr)
{
        struct btrfs_item *item;
        int i;
        u32 nritems;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
        int ret;
        struct extent_buffer *leaf;
        int slot;

        leaf = path->nodes[0];
        slot = path->slots[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        /* the caller must already have made enough room in the leaf */
        if (btrfs_leaf_free_space(root, leaf) < total_size) {
                btrfs_print_leaf(root, leaf);
                printk(KERN_CRIT "not enough freespace need %u have %d\n",
                       total_size, btrfs_leaf_free_space(root, leaf));
                BUG();
        }

        if (slot != nritems) {
                unsigned int old_data = btrfs_item_end_nr(leaf, slot);

                if (old_data < data_end) {
                        btrfs_print_leaf(root, leaf);
                        printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
                               slot, old_data, data_end);
                        BUG_ON(1);
                }
                /*
                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
                 */
                /* first correct the data pointers */
                WARN_ON(leaf->map_token);
                for (i = slot; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(leaf, i);
                        /* cache a mapping of the item array while looping */
                        if (!leaf->map_token) {
                                map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                        }

                        ioff = btrfs_item_offset(leaf, item);
                        btrfs_set_item_offset(leaf, item, ioff - total_data);
                }
                if (leaf->map_token) {
                        unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                        leaf->map_token = NULL;
                }

                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
                              btrfs_item_nr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_item));

                /* shift the data */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end - total_data, btrfs_leaf_data(leaf) +
                              data_end, old_data - data_end);
                data_end = old_data;
        }

        /* setup the item for the new data */
        for (i = 0; i < nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(leaf, slot + i);
                btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
                data_end -= data_size[i];
                btrfs_set_item_size(leaf, item, data_size[i]);
        }

        btrfs_set_header_nritems(leaf, nritems + nr);

        ret = 0;
        if (slot == 0) {
                /* new smallest key in the leaf; propagate it upward */
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
        btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        return ret;
}
3652
3653 /*
3654  * Given a key and some data, insert items into the tree.
3655  * This does all the path init required, making room in the tree if needed.
3656  */
3657 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3658                             struct btrfs_root *root,
3659                             struct btrfs_path *path,
3660                             struct btrfs_key *cpu_key, u32 *data_size,
3661                             int nr)
3662 {
3663         int ret = 0;
3664         int slot;
3665         int i;
3666         u32 total_size = 0;
3667         u32 total_data = 0;
3668
3669         for (i = 0; i < nr; i++)
3670                 total_data += data_size[i];
3671
3672         total_size = total_data + (nr * sizeof(struct btrfs_item));
3673         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3674         if (ret == 0)
3675                 return -EEXIST;
3676         if (ret < 0)
3677                 goto out;
3678
3679         slot = path->slots[0];
3680         BUG_ON(slot < 0);
3681
3682         ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3683                                total_data, total_size, nr);
3684
3685 out:
3686         return ret;
3687 }
3688
3689 /*
3690  * Given a key and some data, insert an item into the tree.
3691  * This does all the path init required, making room in the tree if needed.
3692  */
3693 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3694                       *root, struct btrfs_key *cpu_key, void *data, u32
3695                       data_size)
3696 {
3697         int ret = 0;
3698         struct btrfs_path *path;
3699         struct extent_buffer *leaf;
3700         unsigned long ptr;
3701
3702         path = btrfs_alloc_path();
3703         if (!path)
3704                 return -ENOMEM;
3705         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3706         if (!ret) {
3707                 leaf = path->nodes[0];
3708                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3709                 write_extent_buffer(leaf, data, ptr, data_size);
3710                 btrfs_mark_buffer_dirty(leaf);
3711         }
3712         btrfs_free_path(path);
3713         return ret;
3714 }
3715
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * 'level' is the level of the node holding the pointer and 'slot' the
 * index of the key/blockptr pair to remove.  Returns 0 on success or
 * the error from fixup_low_keys().
 */
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot)
{
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
        int ret = 0;
        int wret;

        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
                /* close the gap left by the removed key_ptr */
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
        }
        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
                BUG_ON(btrfs_header_level(root->node) != 1);
                /* just turn the root into a leaf and break */
                btrfs_set_header_level(root->node, 0);
        } else if (slot == 0) {
                /* the node's first key changed; propagate it upward */
                struct btrfs_disk_key disk_key;

                btrfs_node_key(parent, &disk_key, 0);
                wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
                if (wret)
                        ret = wret;
        }
        btrfs_mark_buffer_dirty(parent);
        return ret;
}
3755
3756 /*
3757  * a helper function to delete the leaf pointed to by path->slots[1] and
3758  * path->nodes[1].
3759  *
3760  * This deletes the pointer in path->nodes[1] and frees the leaf
3761  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3762  *
3763  * The path must have already been setup for deleting the leaf, including
3764  * all the proper balancing.  path->nodes[1] must be locked.
3765  */
3766 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3767                                    struct btrfs_root *root,
3768                                    struct btrfs_path *path,
3769                                    struct extent_buffer *leaf)
3770 {
3771         int ret;
3772
3773         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3774         ret = del_ptr(trans, root, path, 1, path->slots[1]);
3775         if (ret)
3776                 return ret;
3777
3778         /*
3779          * btrfs_free_extent is expensive, we want to make sure we
3780          * aren't holding any locks when we call it
3781          */
3782         btrfs_unlock_up_safe(path, 0);
3783
3784         root_sub_used(root, leaf->len);
3785
3786         btrfs_free_tree_block(trans, root, leaf, 0, 1);
3787         return 0;
3788 }
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 *
 * 'slot' is the first item to delete and 'nr' the number of consecutive
 * items removed.  A leaf that drains completely is deleted; a leaf that
 * becomes mostly empty is rebalanced into its neighbors.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                    struct btrfs_path *path, int slot, int nr)
{
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        int last_off;
        int dsize = 0;
        int ret = 0;
        int wret;
        int i;
        u32 nritems;

        leaf = path->nodes[0];
        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

        /* total bytes of item data being removed */
        for (i = 0; i < nr; i++)
                dsize += btrfs_item_size_nr(leaf, slot + i);

        nritems = btrfs_header_nritems(leaf);

        if (slot + nr != nritems) {
                int data_end = leaf_data_end(root, leaf);

                /* slide the surviving data over the deleted range */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + dsize,
                              btrfs_leaf_data(leaf) + data_end,
                              last_off - data_end);

                /* fix the data offsets of the items whose data moved */
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(leaf, i);
                        /* cache a mapping of the item array while looping */
                        if (!leaf->map_token) {
                                map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                        }
                        ioff = btrfs_item_offset(leaf, item);
                        btrfs_set_item_offset(leaf, item, ioff + dsize);
                }

                if (leaf->map_token) {
                        unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                        leaf->map_token = NULL;
                }

                /* close the gap in the item header array */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
                              btrfs_item_nr_offset(slot + nr),
                              sizeof(struct btrfs_item) *
                              (nritems - slot - nr));
        }
        btrfs_set_header_nritems(leaf, nritems - nr);
        nritems -= nr;

        /* delete the leaf if we've emptied it */
        if (nritems == 0) {
                if (leaf == root->node) {
                        btrfs_set_header_level(leaf, 0);
                } else {
                        btrfs_set_path_blocking(path);
                        clean_tree_block(trans, root, leaf);
                        ret = btrfs_del_leaf(trans, root, path, leaf);
                        BUG_ON(ret);
                }
        } else {
                int used = leaf_space_used(leaf, 0, nritems);
                if (slot == 0) {
                        struct btrfs_disk_key disk_key;

                        /* the leaf's first key changed; push it upward */
                        btrfs_item_key(leaf, &disk_key, 0);
                        wret = fixup_low_keys(trans, root, path,
                                              &disk_key, 1);
                        if (wret)
                                ret = wret;
                }

                /* delete the leaf if it is mostly empty */
                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
                         */
                        slot = path->slots[1];
                        /* hold an extra ref; the pushes may drop the leaf */
                        extent_buffer_get(leaf);

                        btrfs_set_path_blocking(path);
                        wret = push_leaf_left(trans, root, path, 1, 1,
                                              1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;

                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
                                wret = push_leaf_right(trans, root, path, 1,
                                                       1, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }

                        if (btrfs_header_nritems(leaf) == 0) {
                                path->slots[1] = slot;
                                ret = btrfs_del_leaf(trans, root, path, leaf);
                                BUG_ON(ret);
                                free_extent_buffer(leaf);
                        } else {
                                /* if we're still in the path, make sure
                                 * we're dirty.  Otherwise, one of the
                                 * push_leaf functions must have already
                                 * dirtied this buffer
                                 */
                                if (path->nodes[0] == leaf)
                                        btrfs_mark_buffer_dirty(leaf);
                                free_extent_buffer(leaf);
                        }
                } else {
                        btrfs_mark_buffer_dirty(leaf);
                }
        }
        return ret;
}
3915
3916 /*
3917  * search the tree again to find a leaf with lesser keys
3918  * returns 0 if it found something or 1 if there are no lesser leaves.
3919  * returns < 0 on io errors.
3920  *
3921  * This may release the path, and so you may lose any locks held at the
3922  * time you call it.
3923  */
3924 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3925 {
3926         struct btrfs_key key;
3927         struct btrfs_disk_key found_key;
3928         int ret;
3929
3930         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3931
3932         if (key.offset > 0)
3933                 key.offset--;
3934         else if (key.type > 0)
3935                 key.type--;
3936         else if (key.objectid > 0)
3937                 key.objectid--;
3938         else
3939                 return 1;
3940
3941         btrfs_release_path(path);
3942         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3943         if (ret < 0)
3944                 return ret;
3945         btrfs_item_key(path->nodes[0], &found_key, 0);
3946         ret = comp_keys(&found_key, &key);
3947         if (ret < 0)
3948                 return 0;
3949         return 1;
3950 }
3951
3952 /*
3953  * A helper function to walk down the tree starting at min_key, and looking
3954  * for nodes or leaves that are either in cache or have a minimum
3955  * transaction id.  This is used by the btree defrag code, and tree logging
3956  *
3957  * This does not cow, but it does stuff the starting key it finds back
3958  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3959  * key and get a writable path.
3960  *
3961  * This does lock as it descends, and path->keep_locks should be set
3962  * to 1 by the caller.
3963  *
3964  * This honors path->lowest_level to prevent descent past a given level
3965  * of the tree.
3966  *
3967  * min_trans indicates the oldest transaction that you are interested
3968  * in walking through.  Any nodes or leaves older than min_trans are
3969  * skipped over (without reading them).
3970  *
3971  * returns zero if something useful was found, < 0 on error and 1 if there
3972  * was nothing in the tree that matched the search criteria.
3973  */
3974 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3975                          struct btrfs_key *max_key,
3976                          struct btrfs_path *path, int cache_only,
3977                          u64 min_trans)
3978 {
3979         struct extent_buffer *cur;
3980         struct btrfs_key found_key;
3981         int slot;
3982         int sret;
3983         u32 nritems;
3984         int level;
3985         int ret = 1;
3986
3987         WARN_ON(!path->keep_locks);
3988 again:
3989         cur = btrfs_lock_root_node(root);
3990         level = btrfs_header_level(cur);
3991         WARN_ON(path->nodes[level]);
3992         path->nodes[level] = cur;
3993         path->locks[level] = 1;
3994
3995         if (btrfs_header_generation(cur) < min_trans) {
3996                 ret = 1;
3997                 goto out;
3998         }
3999         while (1) {
4000                 nritems = btrfs_header_nritems(cur);
4001                 level = btrfs_header_level(cur);
4002                 sret = bin_search(cur, min_key, level, &slot);
4003
4004                 /* at the lowest level, we're done, setup the path and exit */
4005                 if (level == path->lowest_level) {
4006                         if (slot >= nritems)
4007                                 goto find_next_key;
4008                         ret = 0;
4009                         path->slots[level] = slot;
4010                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4011                         goto out;
4012                 }
4013                 if (sret && slot > 0)
4014                         slot--;
4015                 /*
4016                  * check this node pointer against the cache_only and
4017                  * min_trans parameters.  If it isn't in cache or is too
4018                  * old, skip to the next one.
4019                  */
4020                 while (slot < nritems) {
4021                         u64 blockptr;
4022                         u64 gen;
4023                         struct extent_buffer *tmp;
4024                         struct btrfs_disk_key disk_key;
4025
4026                         blockptr = btrfs_node_blockptr(cur, slot);
4027                         gen = btrfs_node_ptr_generation(cur, slot);
4028                         if (gen < min_trans) {
4029                                 slot++;
4030                                 continue;
4031                         }
4032                         if (!cache_only)
4033                                 break;
4034
4035                         if (max_key) {
4036                                 btrfs_node_key(cur, &disk_key, slot);
4037                                 if (comp_keys(&disk_key, max_key) >= 0) {
4038                                         ret = 1;
4039                                         goto out;
4040                                 }
4041                         }
4042
4043                         tmp = btrfs_find_tree_block(root, blockptr,
4044                                             btrfs_level_size(root, level - 1));
4045
4046                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
4047                                 free_extent_buffer(tmp);
4048                                 break;
4049                         }
4050                         if (tmp)
4051                                 free_extent_buffer(tmp);
4052                         slot++;
4053                 }
4054 find_next_key:
4055                 /*
4056                  * we didn't find a candidate key in this node, walk forward
4057                  * and find another one
4058                  */
4059                 if (slot >= nritems) {
4060                         path->slots[level] = slot;
4061                         btrfs_set_path_blocking(path);
4062                         sret = btrfs_find_next_key(root, path, min_key, level,
4063                                                   cache_only, min_trans);
4064                         if (sret == 0) {
4065                                 btrfs_release_path(path);
4066                                 goto again;
4067                         } else {
4068                                 goto out;
4069                         }
4070                 }
4071                 /* save our key for returning back */
4072                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4073                 path->slots[level] = slot;
4074                 if (level == path->lowest_level) {
4075                         ret = 0;
4076                         unlock_up(path, level, 1);
4077                         goto out;
4078                 }
4079                 btrfs_set_path_blocking(path);
4080                 cur = read_node_slot(root, cur, slot);
4081                 BUG_ON(!cur);
4082
4083                 btrfs_tree_lock(cur);
4084
4085                 path->locks[level - 1] = 1;
4086                 path->nodes[level - 1] = cur;
4087                 unlock_up(path, level, 1);
4088                 btrfs_clear_path_blocking(path, NULL);
4089         }
4090 out:
4091         if (ret == 0)
4092                 memcpy(min_key, &found_key, sizeof(found_key));
4093         btrfs_set_path_blocking(path);
4094         return ret;
4095 }
4096
4097 /*
4098  * this is similar to btrfs_next_leaf, but does not try to preserve
4099  * and fixup the path.  It looks for and returns the next key in the
4100  * tree based on the current path and the cache_only and min_trans
4101  * parameters.
4102  *
4103  * 0 is returned if another key is found, < 0 if there are any errors
4104  * and 1 is returned if there are no higher keys in the tree
4105  *
4106  * path->keep_locks should be set to 1 on the search made before
4107  * calling this function.
4108  */
4109 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4110                         struct btrfs_key *key, int level,
4111                         int cache_only, u64 min_trans)
4112 {
4113         int slot;
4114         struct extent_buffer *c;
4115
4116         WARN_ON(!path->keep_locks);
4117         while (level < BTRFS_MAX_LEVEL) {
4118                 if (!path->nodes[level])
4119                         return 1;
4120
4121                 slot = path->slots[level] + 1;
4122                 c = path->nodes[level];
4123 next:
4124                 if (slot >= btrfs_header_nritems(c)) {
4125                         int ret;
4126                         int orig_lowest;
4127                         struct btrfs_key cur_key;
4128                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4129                             !path->nodes[level + 1])
4130                                 return 1;
4131
4132                         if (path->locks[level + 1]) {
4133                                 level++;
4134                                 continue;
4135                         }
4136
4137                         slot = btrfs_header_nritems(c) - 1;
4138                         if (level == 0)
4139                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4140                         else
4141                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4142
4143                         orig_lowest = path->lowest_level;
4144                         btrfs_release_path(path);
4145                         path->lowest_level = level;
4146                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4147                                                 0, 0);
4148                         path->lowest_level = orig_lowest;
4149                         if (ret < 0)
4150                                 return ret;
4151
4152                         c = path->nodes[level];
4153                         slot = path->slots[level];
4154                         if (ret == 0)
4155                                 slot++;
4156                         goto next;
4157                 }
4158
4159                 if (level == 0)
4160                         btrfs_item_key_to_cpu(c, key, slot);
4161                 else {
4162                         u64 blockptr = btrfs_node_blockptr(c, slot);
4163                         u64 gen = btrfs_node_ptr_generation(c, slot);
4164
4165                         if (cache_only) {
4166                                 struct extent_buffer *cur;
4167                                 cur = btrfs_find_tree_block(root, blockptr,
4168                                             btrfs_level_size(root, level - 1));
4169                                 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4170                                         slot++;
4171                                         if (cur)
4172                                                 free_extent_buffer(cur);
4173                                         goto next;
4174                                 }
4175                                 free_extent_buffer(cur);
4176                         }
4177                         if (gen < min_trans) {
4178                                 slot++;
4179                                 goto next;
4180                         }
4181                         btrfs_node_key_to_cpu(c, key, slot);
4182                 }
4183                 return 0;
4184         }
4185         return 1;
4186 }
4187
4188 /*
4189  * search the tree again to find a leaf with greater keys
4190  * returns 0 if it found something or 1 if there are no greater leaves.
4191  * returns < 0 on io errors.
4192  */
4193 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4194 {
4195         int slot;
4196         int level;
4197         struct extent_buffer *c;
4198         struct extent_buffer *next;
4199         struct btrfs_key key;
4200         u32 nritems;
4201         int ret;
4202         int old_spinning = path->leave_spinning;
4203         int force_blocking = 0;
4204
4205         nritems = btrfs_header_nritems(path->nodes[0]);
4206         if (nritems == 0)
4207                 return 1;
4208
4209         /*
4210          * we take the blocks in an order that upsets lockdep.  Using
4211          * blocking mode is the only way around it.
4212          */
4213 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4214         force_blocking = 1;
4215 #endif
4216
4217         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4218 again:
4219         level = 1;
4220         next = NULL;
4221         btrfs_release_path(path);
4222
4223         path->keep_locks = 1;
4224
4225         if (!force_blocking)
4226                 path->leave_spinning = 1;
4227
4228         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4229         path->keep_locks = 0;
4230
4231         if (ret < 0)
4232                 return ret;
4233
4234         nritems = btrfs_header_nritems(path->nodes[0]);
4235         /*
4236          * by releasing the path above we dropped all our locks.  A balance
4237          * could have added more items next to the key that used to be
4238          * at the very end of the block.  So, check again here and
4239          * advance the path if there are now more items available.
4240          */
4241         if (nritems > 0 && path->slots[0] < nritems - 1) {
4242                 if (ret == 0)
4243                         path->slots[0]++;
4244                 ret = 0;
4245                 goto done;
4246         }
4247
4248         while (level < BTRFS_MAX_LEVEL) {
4249                 if (!path->nodes[level]) {
4250                         ret = 1;
4251                         goto done;
4252                 }
4253
4254                 slot = path->slots[level] + 1;
4255                 c = path->nodes[level];
4256                 if (slot >= btrfs_header_nritems(c)) {
4257                         level++;
4258                         if (level == BTRFS_MAX_LEVEL) {
4259                                 ret = 1;
4260                                 goto done;
4261                         }
4262                         continue;
4263                 }
4264
4265                 if (next) {
4266                         btrfs_tree_unlock(next);
4267                         free_extent_buffer(next);
4268                 }
4269
4270                 next = c;
4271                 ret = read_block_for_search(NULL, root, path, &next, level,
4272                                             slot, &key);
4273                 if (ret == -EAGAIN)
4274                         goto again;
4275
4276                 if (ret < 0) {
4277                         btrfs_release_path(path);
4278                         goto done;
4279                 }
4280
4281                 if (!path->skip_locking) {
4282                         ret = btrfs_try_spin_lock(next);
4283                         if (!ret) {
4284                                 btrfs_set_path_blocking(path);
4285                                 btrfs_tree_lock(next);
4286                                 if (!force_blocking)
4287                                         btrfs_clear_path_blocking(path, next);
4288                         }
4289                         if (force_blocking)
4290                                 btrfs_set_lock_blocking(next);
4291                 }
4292                 break;
4293         }
4294         path->slots[level] = slot;
4295         while (1) {
4296                 level--;
4297                 c = path->nodes[level];
4298                 if (path->locks[level])
4299                         btrfs_tree_unlock(c);
4300
4301                 free_extent_buffer(c);
4302                 path->nodes[level] = next;
4303                 path->slots[level] = 0;
4304                 if (!path->skip_locking)
4305                         path->locks[level] = 1;
4306
4307                 if (!level)
4308                         break;
4309
4310                 ret = read_block_for_search(NULL, root, path, &next, level,
4311                                             0, &key);
4312                 if (ret == -EAGAIN)
4313                         goto again;
4314
4315                 if (ret < 0) {
4316                         btrfs_release_path(path);
4317                         goto done;
4318                 }
4319
4320                 if (!path->skip_locking) {
4321                         btrfs_assert_tree_locked(path->nodes[level]);
4322                         ret = btrfs_try_spin_lock(next);
4323                         if (!ret) {
4324                                 btrfs_set_path_blocking(path);
4325                                 btrfs_tree_lock(next);
4326                                 if (!force_blocking)
4327                                         btrfs_clear_path_blocking(path, next);
4328                         }
4329                         if (force_blocking)
4330                                 btrfs_set_lock_blocking(next);
4331                 }
4332         }
4333         ret = 0;
4334 done:
4335         unlock_up(path, 0, 1);
4336         path->leave_spinning = old_spinning;
4337         if (!old_spinning)
4338                 btrfs_set_path_blocking(path);
4339
4340         return ret;
4341 }
4342
4343 /*
4344  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4345  * searching until it gets past min_objectid or finds an item of 'type'
4346  *
4347  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4348  */
4349 int btrfs_previous_item(struct btrfs_root *root,
4350                         struct btrfs_path *path, u64 min_objectid,
4351                         int type)
4352 {
4353         struct btrfs_key found_key;
4354         struct extent_buffer *leaf;
4355         u32 nritems;
4356         int ret;
4357
4358         while (1) {
4359                 if (path->slots[0] == 0) {
4360                         btrfs_set_path_blocking(path);
4361                         ret = btrfs_prev_leaf(root, path);
4362                         if (ret != 0)
4363                                 return ret;
4364                 } else {
4365                         path->slots[0]--;
4366                 }
4367                 leaf = path->nodes[0];
4368                 nritems = btrfs_header_nritems(leaf);
4369                 if (nritems == 0)
4370                         return 1;
4371                 if (path->slots[0] == nritems)
4372                         path->slots[0]--;
4373
4374                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4375                 if (found_key.objectid < min_objectid)
4376                         break;
4377                 if (found_key.type == type)
4378                         return 0;
4379                 if (found_key.objectid == min_objectid &&
4380                     found_key.type < type)
4381                         break;
4382         }
4383         return 1;
4384 }