Btrfs: stop using highmem for extent_buffers
[pandora-kernel.git] / fs / btrfs / extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include "extent_io.h"
14 #include "extent_map.h"
15 #include "compat.h"
16 #include "ctree.h"
17 #include "btrfs_inode.h"
18
/* slab caches for the two object types this file allocates */
19 static struct kmem_cache *extent_state_cache;
20 static struct kmem_cache *extent_buffer_cache;
21
/*
 * Global lists of live extent_buffers / extent_states, used only by
 * the LEAK_DEBUG accounting below and dumped in extent_io_exit().
 */
22 static LIST_HEAD(buffers);
23 static LIST_HEAD(states);
24
25 #define LEAK_DEBUG 0
26 #if LEAK_DEBUG
27 static DEFINE_SPINLOCK(leak_lock);
28 #endif
29
/* max entries kept on a per-cpu extent_buffer lru */
30 #define BUFFER_LRU_MAX 64
31
/* generic [start, end] keyed node used for rbtree searches below */
32 struct tree_entry {
33         u64 start;
34         u64 end;
35         struct rb_node rb_node;
36 };
37
/* per-call context threaded through the writepage paths */
38 struct extent_page_data {
39         struct bio *bio;
40         struct extent_io_tree *tree;
41         get_extent_t *get_extent;
42
43         /* tells writepage not to lock the state bits for this range
44          * it still does the unlocking
45          */
46         unsigned int extent_locked:1;
47
48         /* tells the submit_bio code to use a WRITE_SYNC */
49         unsigned int sync_io:1;
50 };
51
52 int __init extent_io_init(void)
53 {
54         extent_state_cache = kmem_cache_create("extent_state",
55                         sizeof(struct extent_state), 0,
56                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
57         if (!extent_state_cache)
58                 return -ENOMEM;
59
60         extent_buffer_cache = kmem_cache_create("extent_buffers",
61                         sizeof(struct extent_buffer), 0,
62                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
63         if (!extent_buffer_cache)
64                 goto free_state_cache;
65         return 0;
66
67 free_state_cache:
68         kmem_cache_destroy(extent_state_cache);
69         return -ENOMEM;
70 }
71
72 void extent_io_exit(void)
73 {
74         struct extent_state *state;
75         struct extent_buffer *eb;
76
77         while (!list_empty(&states)) {
78                 state = list_entry(states.next, struct extent_state, leak_list);
79                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
80                        "state %lu in tree %p refs %d\n",
81                        (unsigned long long)state->start,
82                        (unsigned long long)state->end,
83                        state->state, state->tree, atomic_read(&state->refs));
84                 list_del(&state->leak_list);
85                 kmem_cache_free(extent_state_cache, state);
86
87         }
88
89         while (!list_empty(&buffers)) {
90                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
91                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
92                        "refs %d\n", (unsigned long long)eb->start,
93                        eb->len, atomic_read(&eb->refs));
94                 list_del(&eb->leak_list);
95                 kmem_cache_free(extent_buffer_cache, eb);
96         }
97         if (extent_state_cache)
98                 kmem_cache_destroy(extent_state_cache);
99         if (extent_buffer_cache)
100                 kmem_cache_destroy(extent_buffer_cache);
101 }
102
103 void extent_io_tree_init(struct extent_io_tree *tree,
104                          struct address_space *mapping)
105 {
106         tree->state = RB_ROOT;
107         INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
108         tree->ops = NULL;
109         tree->dirty_bytes = 0;
110         spin_lock_init(&tree->lock);
111         spin_lock_init(&tree->buffer_lock);
112         tree->mapping = mapping;
113 }
114
115 static struct extent_state *alloc_extent_state(gfp_t mask)
116 {
117         struct extent_state *state;
118 #if LEAK_DEBUG
119         unsigned long flags;
120 #endif
121
122         state = kmem_cache_alloc(extent_state_cache, mask);
123         if (!state)
124                 return state;
125         state->state = 0;
126         state->private = 0;
127         state->tree = NULL;
128 #if LEAK_DEBUG
129         spin_lock_irqsave(&leak_lock, flags);
130         list_add(&state->leak_list, &states);
131         spin_unlock_irqrestore(&leak_lock, flags);
132 #endif
133         atomic_set(&state->refs, 1);
134         init_waitqueue_head(&state->wq);
135         return state;
136 }
137
138 void free_extent_state(struct extent_state *state)
139 {
140         if (!state)
141                 return;
142         if (atomic_dec_and_test(&state->refs)) {
143 #if LEAK_DEBUG
144                 unsigned long flags;
145 #endif
146                 WARN_ON(state->tree);
147 #if LEAK_DEBUG
148                 spin_lock_irqsave(&leak_lock, flags);
149                 list_del(&state->leak_list);
150                 spin_unlock_irqrestore(&leak_lock, flags);
151 #endif
152                 kmem_cache_free(extent_state_cache, state);
153         }
154 }
155
156 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
157                                    struct rb_node *node)
158 {
159         struct rb_node **p = &root->rb_node;
160         struct rb_node *parent = NULL;
161         struct tree_entry *entry;
162
163         while (*p) {
164                 parent = *p;
165                 entry = rb_entry(parent, struct tree_entry, rb_node);
166
167                 if (offset < entry->start)
168                         p = &(*p)->rb_left;
169                 else if (offset > entry->end)
170                         p = &(*p)->rb_right;
171                 else
172                         return parent;
173         }
174
175         entry = rb_entry(node, struct tree_entry, rb_node);
176         rb_link_node(node, parent, p);
177         rb_insert_color(node, root);
178         return NULL;
179 }
180
/*
 * Walk the extent state rbtree looking for the entry whose
 * [start, end] range contains @offset.
 *
 * On an exact hit the matching rb_node is returned.  On a miss NULL
 * is returned and, if requested, *prev_ret is set to the first entry
 * ending at or after @offset and *next_ret to the last entry starting
 * at or before @offset (either may be NULL at the tree edges).
 *
 * NOTE(review): rb_entry() on a possibly-NULL node is pointer
 * arithmetic only; 'prev' is always re-checked before 'prev_entry'
 * is dereferenced, so the loops below do not touch a NULL entry.
 */
181 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
182                                      struct rb_node **prev_ret,
183                                      struct rb_node **next_ret)
184 {
185         struct rb_root *root = &tree->state;
186         struct rb_node *n = root->rb_node;
187         struct rb_node *prev = NULL;
188         struct rb_node *orig_prev = NULL;
189         struct tree_entry *entry;
190         struct tree_entry *prev_entry = NULL;
191
192         while (n) {
193                 entry = rb_entry(n, struct tree_entry, rb_node);
194                 prev = n;
195                 prev_entry = entry;
196
197                 if (offset < entry->start)
198                         n = n->rb_left;
199                 else if (offset > entry->end)
200                         n = n->rb_right;
201                 else
202                         return n;
203         }
204
                /* 'prev' is the last node visited; advance forward until
                 * we find the first entry ending after offset */
205         if (prev_ret) {
206                 orig_prev = prev;
207                 while (prev && offset > prev_entry->end) {
208                         prev = rb_next(prev);
209                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
210                 }
211                 *prev_ret = prev;
212                 prev = orig_prev;
213         }
214
                /* walk backward to the last entry starting before offset */
215         if (next_ret) {
216                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
217                 while (prev && offset < prev_entry->start) {
218                         prev = rb_prev(prev);
219                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
220                 }
221                 *next_ret = prev;
222         }
223         return NULL;
224 }
225
226 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
227                                           u64 offset)
228 {
229         struct rb_node *prev = NULL;
230         struct rb_node *ret;
231
232         ret = __etree_search(tree, offset, &prev, NULL);
233         if (!ret)
234                 return prev;
235         return ret;
236 }
237
238 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
239                      struct extent_state *other)
240 {
241         if (tree->ops && tree->ops->merge_extent_hook)
242                 tree->ops->merge_extent_hook(tree->mapping->host, new,
243                                              other);
244 }
245
246 /*
247  * utility function to look for merge candidates inside a given range.
248  * Any extents with matching state are merged together into a single
249  * extent in the tree.  Extents with EXTENT_IO in their state field
250  * are not merged because the end_io handlers need to be able to do
251  * operations on them without sleeping (or doing allocations/splits).
252  *
253  * This should be called with the tree lock held.
254  */
255 static int merge_state(struct extent_io_tree *tree,
256                        struct extent_state *state)
257 {
258         struct extent_state *other;
259         struct rb_node *other_node;
260
261         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
262                 return 0;
263
264         other_node = rb_prev(&state->rb_node);
265         if (other_node) {
266                 other = rb_entry(other_node, struct extent_state, rb_node);
267                 if (other->end == state->start - 1 &&
268                     other->state == state->state) {
269                         merge_cb(tree, state, other);
270                         state->start = other->start;
271                         other->tree = NULL;
272                         rb_erase(&other->rb_node, &tree->state);
273                         free_extent_state(other);
274                 }
275         }
276         other_node = rb_next(&state->rb_node);
277         if (other_node) {
278                 other = rb_entry(other_node, struct extent_state, rb_node);
279                 if (other->start == state->end + 1 &&
280                     other->state == state->state) {
281                         merge_cb(tree, state, other);
282                         state->end = other->end;
283                         other->tree = NULL;
284                         rb_erase(&other->rb_node, &tree->state);
285                         free_extent_state(other);
286                 }
287         }
288
289         return 0;
290 }
291
292 static int set_state_cb(struct extent_io_tree *tree,
293                          struct extent_state *state, int *bits)
294 {
295         if (tree->ops && tree->ops->set_bit_hook) {
296                 return tree->ops->set_bit_hook(tree->mapping->host,
297                                                state, bits);
298         }
299
300         return 0;
301 }
302
303 static void clear_state_cb(struct extent_io_tree *tree,
304                            struct extent_state *state, int *bits)
305 {
306         if (tree->ops && tree->ops->clear_bit_hook)
307                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
308 }
309
310 /*
311  * insert an extent_state struct into the tree.  'bits' are set on the
312  * struct before it is inserted.
313  *
314  * This may return -EEXIST if the extent is already there, in which case the
315  * state struct is freed.
316  *
317  * The tree lock is not taken internally.  This is a utility function and
318  * probably isn't what you want to call (see set/clear_extent_bit).
319  */
/*
 * Prepare @state for [start, end], apply the non-control bits, and
 * insert it into the tree keyed by 'end'.  Caller holds the tree
 * lock.  Returns 0, an error from the set_bit_hook, or -EEXIST when
 * a node already covers this key.
 *
 * NOTE(review): on -EEXIST the state is NOT freed here, despite what
 * the comment above this function says — callers own it and either
 * BUG on or free it.  Verify before relying on the header comment.
 */
320 static int insert_state(struct extent_io_tree *tree,
321                         struct extent_state *state, u64 start, u64 end,
322                         int *bits)
323 {
324         struct rb_node *node;
                /* control bits steer behaviour only; never stored in ->state */
325         int bits_to_set = *bits & ~EXTENT_CTLBITS;
326         int ret;
327
328         if (end < start) {
329                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
330                        (unsigned long long)end,
331                        (unsigned long long)start);
332                 WARN_ON(1);
333         }
334         state->start = start;
335         state->end = end;
336         ret = set_state_cb(tree, state, bits);
337         if (ret)
338                 return ret;
339
340         if (bits_to_set & EXTENT_DIRTY)
341                 tree->dirty_bytes += end - start + 1;
342         state->state |= bits_to_set;
343         node = tree_insert(&tree->state, end, &state->rb_node);
344         if (node) {
345                 struct extent_state *found;
346                 found = rb_entry(node, struct extent_state, rb_node);
347                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
348                        "%llu %llu\n", (unsigned long long)found->start,
349                        (unsigned long long)found->end,
350                        (unsigned long long)start, (unsigned long long)end);
351                 return -EEXIST;
352         }
353         state->tree = tree;
354         merge_state(tree, state);
355         return 0;
356 }
357
358 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
359                      u64 split)
360 {
361         if (tree->ops && tree->ops->split_extent_hook)
362                 return tree->ops->split_extent_hook(tree->mapping->host,
363                                                     orig, split);
364         return 0;
365 }
366
367 /*
368  * split a given extent state struct in two, inserting the preallocated
369  * struct 'prealloc' as the newly created second half.  'split' indicates an
370  * offset inside 'orig' where it should be split.
371  *
372  * Before calling,
373  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
374  * are two extent state structs in the tree:
375  * prealloc: [orig->start, split - 1]
376  * orig: [ split, orig->end ]
377  *
378  * The tree locks are not taken by this function. They need to be held
379  * by the caller.
380  */
381 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
382                        struct extent_state *prealloc, u64 split)
383 {
384         struct rb_node *node;
385
386         split_cb(tree, orig, split);
387
388         prealloc->start = orig->start;
389         prealloc->end = split - 1;
390         prealloc->state = orig->state;
391         orig->start = split;
392
393         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
394         if (node) {
395                 free_extent_state(prealloc);
396                 return -EEXIST;
397         }
398         prealloc->tree = tree;
399         return 0;
400 }
401
402 /*
403  * utility function to clear some bits in an extent state struct.
404  * it will optionally wake up any one waiting on this state (wake == 1), or
405  * forcibly remove the state from the tree (delete == 1).
406  *
407  * If no bits are set on the state struct after clearing things, the
408  * struct is freed and removed from the tree
409  */
/*
 * Clear the non-control @bits from @state, calling the clear_bit_hook
 * and maintaining dirty_bytes accounting.  If nothing is left set the
 * state is erased from the tree and freed; otherwise a merge with
 * neighbours is attempted.
 *
 * Returns the subset of bits_to_clear that were actually set before
 * clearing.  CAUTION: @state may have been freed on return.
 */
410 static int clear_state_bit(struct extent_io_tree *tree,
411                             struct extent_state *state,
412                             int *bits, int wake)
413 {
414         int bits_to_clear = *bits & ~EXTENT_CTLBITS;
415         int ret = state->state & bits_to_clear;
416
417         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
418                 u64 range = state->end - state->start + 1;
419                 WARN_ON(range > tree->dirty_bytes);
420                 tree->dirty_bytes -= range;
421         }
422         clear_state_cb(tree, state, bits);
423         state->state &= ~bits_to_clear;
424         if (wake)
425                 wake_up(&state->wq);
426         if (state->state == 0) {
                        /* fully cleared: drop it from the tree and free */
427                 if (state->tree) {
428                         rb_erase(&state->rb_node, &tree->state);
429                         state->tree = NULL;
430                         free_extent_state(state);
431                 } else {
432                         WARN_ON(1);
433                 }
434         } else {
435                 merge_state(tree, state);
436         }
437         return ret;
438 }
439
440 static struct extent_state *
441 alloc_extent_state_atomic(struct extent_state *prealloc)
442 {
443         if (!prealloc)
444                 prealloc = alloc_extent_state(GFP_ATOMIC);
445
446         return prealloc;
447 }
448
449 /*
450  * clear some bits on a range in the tree.  This may require splitting
451  * or inserting elements in the tree, so the gfp mask is used to
452  * indicate which allocations or sleeping are allowed.
453  *
454  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
455  * the given range from the tree regardless of state (ie for truncate).
456  *
457  * the range [start, end] is inclusive.
458  *
459  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
460  * bits were already set, or zero if none of the bits were already set.
461  */
462 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
463                      int bits, int wake, int delete,
464                      struct extent_state **cached_state,
465                      gfp_t mask)
466 {
467         struct extent_state *state;
468         struct extent_state *cached;
469         struct extent_state *prealloc = NULL;
470         struct rb_node *next_node;
471         struct rb_node *node;
472         u64 last_end;
473         int err;
474         int set = 0;
475         int clear = 0;
476
                /* 'delete' forces every non-control bit to be cleared */
477         if (delete)
478                 bits |= ~EXTENT_CTLBITS;
479         bits |= EXTENT_FIRST_DELALLOC;
480
                /* clearing lock/IO bits may free states, so any cached
                 * reference the caller holds must be dropped */
481         if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
482                 clear = 1;
483 again:
484         if (!prealloc && (mask & __GFP_WAIT)) {
485                 prealloc = alloc_extent_state(mask);
486                 if (!prealloc)
487                         return -ENOMEM;
488         }
489
490         spin_lock(&tree->lock);
                /* fastpath: resume from the caller's cached state */
491         if (cached_state) {
492                 cached = *cached_state;
493
494                 if (clear) {
495                         *cached_state = NULL;
496                         cached_state = NULL;
497                 }
498
499                 if (cached && cached->tree && cached->start <= start &&
500                     cached->end > start) {
501                         if (clear)
502                                 atomic_dec(&cached->refs);
503                         state = cached;
504                         goto hit_next;
505                 }
506                 if (clear)
507                         free_extent_state(cached);
508         }
509         /*
510          * this search will find the extents that end after
511          * our range starts
512          */
513         node = tree_search(tree, start);
514         if (!node)
515                 goto out;
516         state = rb_entry(node, struct extent_state, rb_node);
517 hit_next:
518         if (state->start > end)
519                 goto out;
520         WARN_ON(state->end < start);
521         last_end = state->end;
522
523         /*
524          *     | ---- desired range ---- |
525          *  | state | or
526          *  | ------------- state -------------- |
527          *
528          * We need to split the extent we found, and may flip
529          * bits on second half.
530          *
531          * If the extent we found extends past our range, we
532          * just split and search again.  It'll get split again
533          * the next time though.
534          *
535          * If the extent we found is inside our range, we clear
536          * the desired bit on it.
537          */
538
539         if (state->start < start) {
540                 prealloc = alloc_extent_state_atomic(prealloc);
541                 BUG_ON(!prealloc);
542                 err = split_state(tree, state, prealloc, start);
543                 BUG_ON(err == -EEXIST);
544                 prealloc = NULL;
545                 if (err)
546                         goto out;
547                 if (state->end <= end) {
548                         set |= clear_state_bit(tree, state, &bits, wake);
549                         if (last_end == (u64)-1)
550                                 goto out;
551                         start = last_end + 1;
552                 }
553                 goto search_again;
554         }
555         /*
556          * | ---- desired range ---- |
557          *                        | state |
558          * We need to split the extent, and clear the bit
559          * on the first half
560          */
561         if (state->start <= end && state->end > end) {
562                 prealloc = alloc_extent_state_atomic(prealloc);
563                 BUG_ON(!prealloc);
564                 err = split_state(tree, state, prealloc, end + 1);
565                 BUG_ON(err == -EEXIST);
566                 if (wake)
567                         wake_up(&state->wq);
568
569                 set |= clear_state_bit(tree, prealloc, &bits, wake);
570
571                 prealloc = NULL;
572                 goto out;
573         }
574
575         if (state->end < end && prealloc && !need_resched())
576                 next_node = rb_next(&state->rb_node);
577         else
578                 next_node = NULL;
579
                /* clear_state_bit() may free 'state'; last_end was saved above */
580         set |= clear_state_bit(tree, state, &bits, wake);
581         if (last_end == (u64)-1)
582                 goto out;
583         start = last_end + 1;
584         if (start <= end && next_node) {
585                 state = rb_entry(next_node, struct extent_state,
586                                  rb_node);
587                 if (state->start == start)
588                         goto hit_next;
589         }
590         goto search_again;
591
592 out:
593         spin_unlock(&tree->lock);
594         if (prealloc)
595                 free_extent_state(prealloc);
596
597         return set;
598
599 search_again:
600         if (start > end)
601                 goto out;
602         spin_unlock(&tree->lock);
603         if (mask & __GFP_WAIT)
604                 cond_resched();
605         goto again;
606 }
607
/*
 * Sleep until @state's wait queue is woken.  Drops tree->lock across
 * the schedule() and retakes it afterwards; the caller must hold a
 * reference on @state so it cannot be freed while we sleep.  Always
 * returns 0.
 */
608 static int wait_on_state(struct extent_io_tree *tree,
609                          struct extent_state *state)
610                 __releases(tree->lock)
611                 __acquires(tree->lock)
612 {
613         DEFINE_WAIT(wait);
614         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
615         spin_unlock(&tree->lock);
616         schedule();
617         spin_lock(&tree->lock);
618         finish_wait(&state->wq, &wait);
619         return 0;
620 }
621
622 /*
623  * waits for one or more bits to clear on a range in the state tree.
624  * The range [start, end] is inclusive.
625  * The tree lock is taken by this function
626  */
627 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
628 {
629         struct extent_state *state;
630         struct rb_node *node;
631
632         spin_lock(&tree->lock);
633 again:
634         while (1) {
635                 /*
636                  * this search will find all the extents that end after
637                  * our range starts
638                  */
639                 node = tree_search(tree, start);
640                 if (!node)
641                         break;
642
643                 state = rb_entry(node, struct extent_state, rb_node);
644
645                 if (state->start > end)
646                         goto out;
647
648                 if (state->state & bits) {
649                         start = state->start;
                                /* hold a ref so the state survives the
                                 * unlocked sleep in wait_on_state() */
650                         atomic_inc(&state->refs);
651                         wait_on_state(tree, state);
652                         free_extent_state(state);
                                /* tree may have changed while unlocked:
                                 * restart the search from 'start' */
653                         goto again;
654                 }
655                 start = state->end + 1;
656
657                 if (start > end)
658                         break;
659
660                 if (need_resched()) {
661                         spin_unlock(&tree->lock);
662                         cond_resched();
663                         spin_lock(&tree->lock);
664                 }
665         }
666 out:
667         spin_unlock(&tree->lock);
668         return 0;
669 }
670
671 static int set_state_bits(struct extent_io_tree *tree,
672                            struct extent_state *state,
673                            int *bits)
674 {
675         int ret;
676         int bits_to_set = *bits & ~EXTENT_CTLBITS;
677
678         ret = set_state_cb(tree, state, bits);
679         if (ret)
680                 return ret;
681         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
682                 u64 range = state->end - state->start + 1;
683                 tree->dirty_bytes += range;
684         }
685         state->state |= bits_to_set;
686
687         return 0;
688 }
689
690 static void cache_state(struct extent_state *state,
691                         struct extent_state **cached_ptr)
692 {
693         if (cached_ptr && !(*cached_ptr)) {
694                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
695                         *cached_ptr = state;
696                         atomic_inc(&state->refs);
697                 }
698         }
699 }
700
701 static void uncache_state(struct extent_state **cached_ptr)
702 {
703         if (cached_ptr && (*cached_ptr)) {
704                 struct extent_state *state = *cached_ptr;
705                 *cached_ptr = NULL;
706                 free_extent_state(state);
707         }
708 }
709
710 /*
711  * set some bits on a range in the tree.  This may require allocations or
712  * sleeping, so the gfp mask is used to indicate what is allowed.
713  *
714  * If any of the exclusive bits are set, this will fail with -EEXIST if some
715  * part of the range already has the desired bits set.  The start of the
716  * existing range is returned in failed_start in this case.
717  *
718  * [start, end] is inclusive This takes the tree lock.
719  */
720
721 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
722                    int bits, int exclusive_bits, u64 *failed_start,
723                    struct extent_state **cached_state, gfp_t mask)
724 {
725         struct extent_state *state;
726         struct extent_state *prealloc = NULL;
727         struct rb_node *node;
728         int err = 0;
729         u64 last_start;
730         u64 last_end;
731
732         bits |= EXTENT_FIRST_DELALLOC;
733 again:
                /* with a blocking mask we insist on a preallocated state */
734         if (!prealloc && (mask & __GFP_WAIT)) {
735                 prealloc = alloc_extent_state(mask);
736                 BUG_ON(!prealloc);
737         }
738
739         spin_lock(&tree->lock);
                /* fastpath: resume from the caller-cached state */
740         if (cached_state && *cached_state) {
741                 state = *cached_state;
742                 if (state->start <= start && state->end > start &&
743                     state->tree) {
744                         node = &state->rb_node;
745                         goto hit_next;
746                 }
747         }
748         /*
749          * this search will find all the extents that end after
750          * our range starts.
751          */
752         node = tree_search(tree, start);
753         if (!node) {
                        /* tree empty past 'start': insert the whole range */
754                 prealloc = alloc_extent_state_atomic(prealloc);
755                 BUG_ON(!prealloc);
756                 err = insert_state(tree, prealloc, start, end, &bits);
757                 prealloc = NULL;
758                 BUG_ON(err == -EEXIST);
759                 goto out;
760         }
761         state = rb_entry(node, struct extent_state, rb_node);
762 hit_next:
763         last_start = state->start;
764         last_end = state->end;
765
766         /*
767          * | ---- desired range ---- |
768          * | state |
769          *
770          * Just lock what we found and keep going
771          */
772         if (state->start == start && state->end <= end) {
773                 struct rb_node *next_node;
774                 if (state->state & exclusive_bits) {
775                         *failed_start = state->start;
776                         err = -EEXIST;
777                         goto out;
778                 }
779
780                 err = set_state_bits(tree, state, &bits);
781                 if (err)
782                         goto out;
783
784                 cache_state(state, cached_state);
785                 merge_state(tree, state);
786                 if (last_end == (u64)-1)
787                         goto out;
788
789                 start = last_end + 1;
790                 next_node = rb_next(&state->rb_node);
791                 if (next_node && start < end && prealloc && !need_resched()) {
792                         state = rb_entry(next_node, struct extent_state,
793                                          rb_node);
794                         if (state->start == start)
795                                 goto hit_next;
796                 }
797                 goto search_again;
798         }
799
800         /*
801          *     | ---- desired range ---- |
802          * | state |
803          *   or
804          * | ------------- state -------------- |
805          *
806          * We need to split the extent we found, and may flip bits on
807          * second half.
808          *
809          * If the extent we found extends past our
810          * range, we just split and search again.  It'll get split
811          * again the next time though.
812          *
813          * If the extent we found is inside our range, we set the
814          * desired bit on it.
815          */
816         if (state->start < start) {
817                 if (state->state & exclusive_bits) {
818                         *failed_start = start;
819                         err = -EEXIST;
820                         goto out;
821                 }
822
823                 prealloc = alloc_extent_state_atomic(prealloc);
824                 BUG_ON(!prealloc);
825                 err = split_state(tree, state, prealloc, start);
826                 BUG_ON(err == -EEXIST);
827                 prealloc = NULL;
828                 if (err)
829                         goto out;
830                 if (state->end <= end) {
831                         err = set_state_bits(tree, state, &bits);
832                         if (err)
833                                 goto out;
834                         cache_state(state, cached_state);
835                         merge_state(tree, state);
836                         if (last_end == (u64)-1)
837                                 goto out;
838                         start = last_end + 1;
839                 }
840                 goto search_again;
841         }
842         /*
843          * | ---- desired range ---- |
844          *     | state | or               | state |
845          *
846          * There's a hole, we need to insert something in it and
847          * ignore the extent we found.
848          */
849         if (state->start > start) {
850                 u64 this_end;
851                 if (end < last_start)
852                         this_end = end;
853                 else
854                         this_end = last_start - 1;
855
856                 prealloc = alloc_extent_state_atomic(prealloc);
857                 BUG_ON(!prealloc);
858
859                 /*
860                  * Avoid to free 'prealloc' if it can be merged with
861                  * the later extent.
862                  */
863                 err = insert_state(tree, prealloc, start, this_end,
864                                    &bits);
865                 BUG_ON(err == -EEXIST);
866                 if (err) {
867                         free_extent_state(prealloc);
868                         prealloc = NULL;
869                         goto out;
870                 }
871                 cache_state(prealloc, cached_state);
872                 prealloc = NULL;
873                 start = this_end + 1;
874                 goto search_again;
875         }
876         /*
877          * | ---- desired range ---- |
878          *                        | state |
879          * We need to split the extent, and set the bit
880          * on the first half
881          */
882         if (state->start <= end && state->end > end) {
883                 if (state->state & exclusive_bits) {
884                         *failed_start = start;
885                         err = -EEXIST;
886                         goto out;
887                 }
888
889                 prealloc = alloc_extent_state_atomic(prealloc);
890                 BUG_ON(!prealloc);
891                 err = split_state(tree, state, prealloc, end + 1);
892                 BUG_ON(err == -EEXIST);
893
894                 err = set_state_bits(tree, prealloc, &bits);
895                 if (err) {
896                         prealloc = NULL;
897                         goto out;
898                 }
899                 cache_state(prealloc, cached_state);
900                 merge_state(tree, prealloc);
901                 prealloc = NULL;
902                 goto out;
903         }
904
905         goto search_again;
906
907 out:
908         spin_unlock(&tree->lock);
909         if (prealloc)
910                 free_extent_state(prealloc);
911
912         return err;
913
914 search_again:
                /* drop the lock (and maybe resched) before another pass */
915         if (start > end)
916                 goto out;
917         spin_unlock(&tree->lock);
918         if (mask & __GFP_WAIT)
919                 cond_resched();
920         goto again;
921 }
922
923 /* wrappers around set/clear extent bit */
924 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
925                      gfp_t mask)
926 {
927         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
928                               NULL, mask);
929 }
930
931 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
932                     int bits, gfp_t mask)
933 {
934         return set_extent_bit(tree, start, end, bits, 0, NULL,
935                               NULL, mask);
936 }
937
938 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
939                       int bits, gfp_t mask)
940 {
941         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
942 }
943
944 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
945                         struct extent_state **cached_state, gfp_t mask)
946 {
947         return set_extent_bit(tree, start, end,
948                               EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
949                               0, NULL, cached_state, mask);
950 }
951
952 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
953                        gfp_t mask)
954 {
955         return clear_extent_bit(tree, start, end,
956                                 EXTENT_DIRTY | EXTENT_DELALLOC |
957                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
958 }
959
960 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
961                      gfp_t mask)
962 {
963         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
964                               NULL, mask);
965 }
966
967 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
968                         struct extent_state **cached_state, gfp_t mask)
969 {
970         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
971                               NULL, cached_state, mask);
972 }
973
974 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
975                                  u64 end, struct extent_state **cached_state,
976                                  gfp_t mask)
977 {
978         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
979                                 cached_state, mask);
980 }
981
982 /*
983  * either insert or lock state struct between start and end use mask to tell
984  * us if waiting is desired.
985  */
986 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
987                      int bits, struct extent_state **cached_state, gfp_t mask)
988 {
989         int err;
990         u64 failed_start;
991         while (1) {
992                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
993                                      EXTENT_LOCKED, &failed_start,
994                                      cached_state, mask);
995                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
996                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
997                         start = failed_start;
998                 } else {
999                         break;
1000                 }
1001                 WARN_ON(start > end);
1002         }
1003         return err;
1004 }
1005
1006 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1007 {
1008         return lock_extent_bits(tree, start, end, 0, NULL, mask);
1009 }
1010
1011 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1012                     gfp_t mask)
1013 {
1014         int err;
1015         u64 failed_start;
1016
1017         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1018                              &failed_start, NULL, mask);
1019         if (err == -EEXIST) {
1020                 if (failed_start > start)
1021                         clear_extent_bit(tree, start, failed_start - 1,
1022                                          EXTENT_LOCKED, 1, 0, NULL, mask);
1023                 return 0;
1024         }
1025         return 1;
1026 }
1027
1028 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1029                          struct extent_state **cached, gfp_t mask)
1030 {
1031         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1032                                 mask);
1033 }
1034
1035 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1036 {
1037         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1038                                 mask);
1039 }
1040
1041 /*
1042  * helper function to set both pages and extents in the tree writeback
1043  */
1044 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1045 {
1046         unsigned long index = start >> PAGE_CACHE_SHIFT;
1047         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1048         struct page *page;
1049
1050         while (index <= end_index) {
1051                 page = find_get_page(tree->mapping, index);
1052                 BUG_ON(!page);
1053                 set_page_writeback(page);
1054                 page_cache_release(page);
1055                 index++;
1056         }
1057         return 0;
1058 }
1059
1060 /*
1061  * find the first offset in the io tree with 'bits' set. zero is
1062  * returned if we find something, and *start_ret and *end_ret are
1063  * set to reflect the state struct that was found.
1064  *
1065  * If nothing was found, 1 is returned, < 0 on error
1066  */
1067 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1068                           u64 *start_ret, u64 *end_ret, int bits)
1069 {
1070         struct rb_node *node;
1071         struct extent_state *state;
1072         int ret = 1;
1073
1074         spin_lock(&tree->lock);
1075         /*
1076          * this search will find all the extents that end after
1077          * our range starts.
1078          */
1079         node = tree_search(tree, start);
1080         if (!node)
1081                 goto out;
1082
1083         while (1) {
1084                 state = rb_entry(node, struct extent_state, rb_node);
1085                 if (state->end >= start && (state->state & bits)) {
1086                         *start_ret = state->start;
1087                         *end_ret = state->end;
1088                         ret = 0;
1089                         break;
1090                 }
1091                 node = rb_next(node);
1092                 if (!node)
1093                         break;
1094         }
1095 out:
1096         spin_unlock(&tree->lock);
1097         return ret;
1098 }
1099
1100 /* find the first state struct with 'bits' set after 'start', and
1101  * return it.  tree->lock must be held.  NULL will returned if
1102  * nothing was found after 'start'
1103  */
1104 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1105                                                  u64 start, int bits)
1106 {
1107         struct rb_node *node;
1108         struct extent_state *state;
1109
1110         /*
1111          * this search will find all the extents that end after
1112          * our range starts.
1113          */
1114         node = tree_search(tree, start);
1115         if (!node)
1116                 goto out;
1117
1118         while (1) {
1119                 state = rb_entry(node, struct extent_state, rb_node);
1120                 if (state->end >= start && (state->state & bits))
1121                         return state;
1122
1123                 node = rb_next(node);
1124                 if (!node)
1125                         break;
1126         }
1127 out:
1128         return NULL;
1129 }
1130
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 *
 * The first state found is pinned (ref taken) and handed back through
 * *cached_state; the caller is responsible for dropping that ref.
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
                                        u64 *start, u64 *end, u64 max_bytes,
                                        struct extent_state **cached_state)
{
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 found = 0;          /* count of contiguous delalloc extents */
        u64 total_bytes = 0;
        spin_lock(&tree->lock);

        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, cur_start);
        if (!node) {
                if (!found)
                        *end = (u64)-1;
                goto out;
        }

        while (1) {
                state = rb_entry(node, struct extent_state, rb_node);
                /* stop at a gap or at an explicit EXTENT_BOUNDARY marker */
                if (found && (state->start != cur_start ||
                              (state->state & EXTENT_BOUNDARY))) {
                        goto out;
                }
                if (!(state->state & EXTENT_DELALLOC)) {
                        /* not delalloc; report where this state ends */
                        if (!found)
                                *end = state->end;
                        goto out;
                }
                if (!found) {
                        *start = state->start;
                        /* pin the first state for the caller */
                        *cached_state = state;
                        atomic_inc(&state->refs);
                }
                found++;
                *end = state->end;
                cur_start = state->end + 1;
                node = rb_next(node);
                if (!node)
                        break;
                /*
                 * the size limit is only checked after an extent has
                 * already been added to the range, so max_bytes is a
                 * soft cap; the very last extent in the tree is never
                 * counted at all (we break above before reaching here)
                 */
                total_bytes += state->end - state->start + 1;
                if (total_bytes >= max_bytes)
                        break;
        }
out:
        spin_unlock(&tree->lock);
        return found;
}
1190
1191 static noinline int __unlock_for_delalloc(struct inode *inode,
1192                                           struct page *locked_page,
1193                                           u64 start, u64 end)
1194 {
1195         int ret;
1196         struct page *pages[16];
1197         unsigned long index = start >> PAGE_CACHE_SHIFT;
1198         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1199         unsigned long nr_pages = end_index - index + 1;
1200         int i;
1201
1202         if (index == locked_page->index && end_index == index)
1203                 return 0;
1204
1205         while (nr_pages > 0) {
1206                 ret = find_get_pages_contig(inode->i_mapping, index,
1207                                      min_t(unsigned long, nr_pages,
1208                                      ARRAY_SIZE(pages)), pages);
1209                 for (i = 0; i < ret; i++) {
1210                         if (pages[i] != locked_page)
1211                                 unlock_page(pages[i]);
1212                         page_cache_release(pages[i]);
1213                 }
1214                 nr_pages -= ret;
1215                 index += ret;
1216                 cond_resched();
1217         }
1218         return 0;
1219 }
1220
/*
 * lock every page covering [delalloc_start, delalloc_end] except
 * locked_page, which the caller already holds.  Returns 0 on success
 * or -EAGAIN if a page in the range disappeared or went clean; in the
 * failure case everything locked here is unlocked again first.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
                                        struct page *locked_page,
                                        u64 delalloc_start,
                                        u64 delalloc_end)
{
        unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
        unsigned long start_index = index;
        unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
        unsigned long pages_locked = 0;
        struct page *pages[16];
        unsigned long nrpages;
        int ret;
        int i;

        /* the caller is responsible for locking the start index */
        if (index == locked_page->index && index == end_index)
                return 0;

        /* skip the page at the start index */
        nrpages = end_index - index + 1;
        while (nrpages > 0) {
                ret = find_get_pages_contig(inode->i_mapping, index,
                                     min_t(unsigned long,
                                     nrpages, ARRAY_SIZE(pages)), pages);
                if (ret == 0) {
                        /* some page in the range is gone from the mapping */
                        ret = -EAGAIN;
                        goto done;
                }
                /* now we have an array of pages, lock them all */
                for (i = 0; i < ret; i++) {
                        /*
                         * the caller is taking responsibility for
                         * locked_page
                         */
                        if (pages[i] != locked_page) {
                                lock_page(pages[i]);
                                if (!PageDirty(pages[i]) ||
                                    pages[i]->mapping != inode->i_mapping) {
                                        /* page was cleaned or truncated
                                         * while we waited for its lock */
                                        ret = -EAGAIN;
                                        unlock_page(pages[i]);
                                        page_cache_release(pages[i]);
                                        goto done;
                                }
                        }
                        page_cache_release(pages[i]);
                        pages_locked++;
                }
                nrpages -= ret;
                index += ret;
                cond_resched();
        }
        ret = 0;
done:
        if (ret && pages_locked) {
                /* roll back only the pages we actually processed */
                __unlock_for_delalloc(inode, locked_page,
                              delalloc_start,
                              ((u64)(start_index + pages_locked - 1)) <<
                              PAGE_CACHE_SHIFT);
        }
        return ret;
}
1282
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 *
 * On success the pages in the range (except locked_page, already held
 * by the caller) are locked and the matching extent state is locked in
 * the io tree.
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
                                             struct extent_io_tree *tree,
                                             struct page *locked_page,
                                             u64 *start, u64 *end,
                                             u64 max_bytes)
{
        u64 delalloc_start;
        u64 delalloc_end;
        u64 found;
        struct extent_state *cached_state = NULL;
        int ret;
        int loops = 0;

again:
        /* step one, find a bunch of delalloc bytes starting at start */
        delalloc_start = *start;
        delalloc_end = 0;
        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
                                    max_bytes, &cached_state);
        if (!found || delalloc_end <= *start) {
                *start = delalloc_start;
                *end = delalloc_end;
                /* drop the ref find_delalloc_range may have taken */
                free_extent_state(cached_state);
                return found;
        }

        /*
         * start comes from the offset of locked_page.  We have to lock
         * pages in order, so we can't process delalloc bytes before
         * locked_page
         */
        if (delalloc_start < *start)
                delalloc_start = *start;

        /*
         * make sure to limit the number of pages we try to lock down
         * if we're looping.
         */
        if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
                delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

        /* step two, lock all the pages after the page that has start */
        ret = lock_delalloc_pages(inode, locked_page,
                                  delalloc_start, delalloc_end);
        if (ret == -EAGAIN) {
                /* some of the pages are gone, lets avoid looping by
                 * shortening the size of the delalloc range we're searching
                 */
                free_extent_state(cached_state);
                if (!loops) {
                        /* retry once, limited to locked_page's own page */
                        unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
                        max_bytes = PAGE_CACHE_SIZE - offset;
                        loops = 1;
                        goto again;
                } else {
                        found = 0;
                        goto out_failed;
                }
        }
        BUG_ON(ret);

        /* step three, lock the state bits for the whole range */
        lock_extent_bits(tree, delalloc_start, delalloc_end,
                         0, &cached_state, GFP_NOFS);

        /* then test to make sure it is all still delalloc */
        ret = test_range_bit(tree, delalloc_start, delalloc_end,
                             EXTENT_DELALLOC, 1, cached_state);
        if (!ret) {
                /* the range changed under us; unwind and start over */
                unlock_extent_cached(tree, delalloc_start, delalloc_end,
                                     &cached_state, GFP_NOFS);
                __unlock_for_delalloc(inode, locked_page,
                              delalloc_start, delalloc_end);
                cond_resched();
                goto again;
        }
        free_extent_state(cached_state);
        *start = delalloc_start;
        *end = delalloc_end;
out_failed:
        return found;
}
1371
/*
 * clear the requested bits on [start, end] in the io tree and apply the
 * page-level operations selected by 'op' (unlock, clear dirty, set/end
 * writeback, set Private2) to every page in the range.  locked_page is
 * only released; its lock stays with the caller.
 */
int extent_clear_unlock_delalloc(struct inode *inode,
                                struct extent_io_tree *tree,
                                u64 start, u64 end, struct page *locked_page,
                                unsigned long op)
{
        int ret;
        struct page *pages[16];
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
        unsigned long nr_pages = end_index - index + 1;
        int i;
        int clear_bits = 0;

        /* translate EXTENT_CLEAR_* op flags into tree bits to clear */
        if (op & EXTENT_CLEAR_UNLOCK)
                clear_bits |= EXTENT_LOCKED;
        if (op & EXTENT_CLEAR_DIRTY)
                clear_bits |= EXTENT_DIRTY;

        if (op & EXTENT_CLEAR_DELALLOC)
                clear_bits |= EXTENT_DELALLOC;

        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
        /* done if no per-page work was requested */
        if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
                    EXTENT_SET_PRIVATE2)))
                return 0;

        while (nr_pages > 0) {
                ret = find_get_pages_contig(inode->i_mapping, index,
                                     min_t(unsigned long,
                                     nr_pages, ARRAY_SIZE(pages)), pages);
                for (i = 0; i < ret; i++) {

                        if (op & EXTENT_SET_PRIVATE2)
                                SetPagePrivate2(pages[i]);

                        /* the caller keeps responsibility for locked_page */
                        if (pages[i] == locked_page) {
                                page_cache_release(pages[i]);
                                continue;
                        }
                        if (op & EXTENT_CLEAR_DIRTY)
                                clear_page_dirty_for_io(pages[i]);
                        if (op & EXTENT_SET_WRITEBACK)
                                set_page_writeback(pages[i]);
                        if (op & EXTENT_END_WRITEBACK)
                                end_page_writeback(pages[i]);
                        if (op & EXTENT_CLEAR_UNLOCK_PAGE)
                                unlock_page(pages[i]);
                        page_cache_release(pages[i]);
                }
                nr_pages -= ret;
                index += ret;
                cond_resched();
        }
        return 0;
}
1428
/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 *
 * *start is advanced to the first byte actually counted.  With
 * 'contig' set, counting stops at the first gap after something has
 * been found.
 */
u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end, u64 max_bytes,
                     unsigned long bits, int contig)
{
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 total_bytes = 0;
        u64 last = 0;           /* end of the previous matching extent */
        int found = 0;

        if (search_end <= cur_start) {
                WARN_ON(1);
                return 0;
        }

        spin_lock(&tree->lock);
        if (cur_start == 0 && bits == EXTENT_DIRTY) {
                /* fast path: the tree caches a running dirty-byte total */
                total_bytes = tree->dirty_bytes;
                goto out;
        }
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, cur_start);
        if (!node)
                goto out;

        while (1) {
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->start > search_end)
                        break;
                /* in contig mode, a hole after the first hit ends counting */
                if (contig && found && state->start > last + 1)
                        break;
                if (state->end >= cur_start && (state->state & bits) == bits) {
                        /* clamp the extent to [cur_start, search_end] */
                        total_bytes += min(search_end, state->end) + 1 -
                                       max(cur_start, state->start);
                        if (total_bytes >= max_bytes)
                                break;
                        if (!found) {
                                *start = max(cur_start, state->start);
                                found = 1;
                        }
                        last = state->end;
                } else if (contig && found) {
                        break;
                }
                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        spin_unlock(&tree->lock);
        return total_bytes;
}
1490
1491 /*
1492  * set the private field for a given byte offset in the tree.  If there isn't
1493  * an extent_state there already, this does nothing.
1494  */
1495 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1496 {
1497         struct rb_node *node;
1498         struct extent_state *state;
1499         int ret = 0;
1500
1501         spin_lock(&tree->lock);
1502         /*
1503          * this search will find all the extents that end after
1504          * our range starts.
1505          */
1506         node = tree_search(tree, start);
1507         if (!node) {
1508                 ret = -ENOENT;
1509                 goto out;
1510         }
1511         state = rb_entry(node, struct extent_state, rb_node);
1512         if (state->start != start) {
1513                 ret = -ENOENT;
1514                 goto out;
1515         }
1516         state->private = private;
1517 out:
1518         spin_unlock(&tree->lock);
1519         return ret;
1520 }
1521
1522 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1523 {
1524         struct rb_node *node;
1525         struct extent_state *state;
1526         int ret = 0;
1527
1528         spin_lock(&tree->lock);
1529         /*
1530          * this search will find all the extents that end after
1531          * our range starts.
1532          */
1533         node = tree_search(tree, start);
1534         if (!node) {
1535                 ret = -ENOENT;
1536                 goto out;
1537         }
1538         state = rb_entry(node, struct extent_state, rb_node);
1539         if (state->start != start) {
1540                 ret = -ENOENT;
1541                 goto out;
1542         }
1543         *private = state->private;
1544 out:
1545         spin_unlock(&tree->lock);
1546         return ret;
1547 }
1548
/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 *
 * 'cached' is an optional starting hint, used only when it is known
 * to the tree and actually covers 'start'.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   int bits, int filled, struct extent_state *cached)
{
        struct extent_state *state = NULL;
        struct rb_node *node;
        int bitset = 0;

        spin_lock(&tree->lock);
        if (cached && cached->tree && cached->start <= start &&
            cached->end > start)
                node = &cached->rb_node;
        else
                node = tree_search(tree, start);
        while (node && start <= end) {
                state = rb_entry(node, struct extent_state, rb_node);

                /* a gap before this extent fails a 'filled' test */
                if (filled && state->start > start) {
                        bitset = 0;
                        break;
                }

                if (state->start > end)
                        break;

                if (state->state & bits) {
                        bitset = 1;
                        if (!filled)
                                break;
                } else if (filled) {
                        bitset = 0;
                        break;
                }

                /* avoid wrapping 'start' past the last possible byte */
                if (state->end == (u64)-1)
                        break;

                start = state->end + 1;
                if (start > end)
                        break;
                node = rb_next(node);
                if (!node) {
                        /* ran out of extents before covering the range */
                        if (filled)
                                bitset = 0;
                        break;
                }
        }
        spin_unlock(&tree->lock);
        return bitset;
}
1604
1605 /*
1606  * helper function to set a given page up to date if all the
1607  * extents in the tree for that page are up to date
1608  */
1609 static int check_page_uptodate(struct extent_io_tree *tree,
1610                                struct page *page)
1611 {
1612         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1613         u64 end = start + PAGE_CACHE_SIZE - 1;
1614         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1615                 SetPageUptodate(page);
1616         return 0;
1617 }
1618
1619 /*
1620  * helper function to unlock a page if all the extents in the tree
1621  * for that page are unlocked
1622  */
1623 static int check_page_locked(struct extent_io_tree *tree,
1624                              struct page *page)
1625 {
1626         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1627         u64 end = start + PAGE_CACHE_SIZE - 1;
1628         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1629                 unlock_page(page);
1630         return 0;
1631 }
1632
/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 *
 * NOTE(review): 'tree' is currently unused and writeback is ended
 * unconditionally; the extent-tree check described above is not
 * actually performed here — confirm whether that is intentional.
 */
static int check_page_writeback(struct extent_io_tree *tree,
                             struct page *page)
{
        end_page_writeback(page);
        return 0;
}
1643
/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct bio *bio, int err)
{
        int uptodate = err == 0;
        /* walk the bio_vec array backwards from the last segment */
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct extent_io_tree *tree;
        u64 start;
        u64 end;
        int whole_page;
        int ret;

        do {
                struct page *page = bvec->bv_page;
                tree = &BTRFS_I(page->mapping->host)->io_tree;

                /* file byte range this segment covers */
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
                         bvec->bv_offset;
                end = start + bvec->bv_len - 1;

                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
                        whole_page = 1;
                else
                        whole_page = 0;

                /* prefetch the next segment's page flags while we work */
                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);
                if (tree->ops && tree->ops->writepage_end_io_hook) {
                        ret = tree->ops->writepage_end_io_hook(page, start,
                                                       end, NULL, uptodate);
                        if (ret)
                                uptodate = 0;
                }

                if (!uptodate && tree->ops &&
                    tree->ops->writepage_io_failed_hook) {
                        ret = tree->ops->writepage_io_failed_hook(bio, page,
                                                         start, end, NULL);
                        /*
                         * NOTE(review): ret == 0 appears to mean the hook
                         * took over handling of this segment, so we skip
                         * the error path below — confirm against the hook
                         * implementation
                         */
                        if (ret == 0) {
                                uptodate = (err == 0);
                                continue;
                        }
                }

                if (!uptodate) {
                        clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
                        ClearPageUptodate(page);
                        SetPageError(page);
                }

                if (whole_page)
                        end_page_writeback(page);
                else
                        check_page_writeback(tree, page);
        } while (bvec >= bio->bi_io_vec);

        bio_put(bio);
}
1711
1712 /*
1713  * after a readpage IO is done, we need to:
1714  * clear the uptodate bits on error
1715  * set the uptodate bits if things worked
1716  * set the page up to date if all extents in the tree are uptodate
1717  * clear the lock bit in the extent tree
1718  * unlock the page if there are no other extents locked for it
1719  *
1720  * Scheduling is not allowed, so the extent state tree is expected
1721  * to have one and only one object corresponding to this IO.
1722  */
static void end_bio_extent_readpage(struct bio *bio, int err)
{
        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct bio_vec *bvec = bio->bi_io_vec;
        struct extent_io_tree *tree;
        u64 start;
        u64 end;
        int whole_page;
        int ret;

        if (err)
                uptodate = 0;

        /* walk the bio_vec array forwards, one page range at a time */
        do {
                struct page *page = bvec->bv_page;
                struct extent_state *cached = NULL;
                struct extent_state *state;

                tree = &BTRFS_I(page->mapping->host)->io_tree;

                /* byte range in the file covered by this bio_vec */
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
                        bvec->bv_offset;
                end = start + bvec->bv_len - 1;

                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
                        whole_page = 1;
                else
                        whole_page = 0;

                /* prefetch the next page's flags while we work on this one */
                if (++bvec <= bvec_end)
                        prefetchw(&bvec->bv_page->flags);

                /* look up the EXTENT_LOCKED state that covers this range so
                 * the set/unlock calls below can reuse it */
                spin_lock(&tree->lock);
                state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
                if (state && state->start == start) {
                        /*
                         * take a reference on the state, unlock will drop
                         * the ref
                         */
                        cache_state(state, &cached);
                }
                spin_unlock(&tree->lock);

                /* let the fs verify the data (csums etc); a non-zero return
                 * marks the range as failed */
                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
                                                              state);
                        if (ret)
                                uptodate = 0;
                }
                /* give the fs a chance to retry the read (e.g. from another
                 * mirror); a zero return means it took over this range, so
                 * drop our cached state ref and move on */
                if (!uptodate && tree->ops &&
                    tree->ops->readpage_io_failed_hook) {
                        ret = tree->ops->readpage_io_failed_hook(bio, page,
                                                         start, end, NULL);
                        if (ret == 0) {
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
                                        uptodate = 0;
                                uncache_state(&cached);
                                continue;
                        }
                }

                if (uptodate) {
                        set_extent_uptodate(tree, start, end, &cached,
                                            GFP_ATOMIC);
                }
                /* drops the reference taken by cache_state() above */
                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);

                if (whole_page) {
                        if (uptodate) {
                                SetPageUptodate(page);
                        } else {
                                ClearPageUptodate(page);
                                SetPageError(page);
                        }
                        unlock_page(page);
                } else {
                        /* partial-page read: the page can only go uptodate /
                         * unlocked once all of its sub-ranges have finished */
                        if (uptodate) {
                                check_page_uptodate(tree, page);
                        } else {
                                ClearPageUptodate(page);
                                SetPageError(page);
                        }
                        check_page_locked(tree, page);
                }
        } while (bvec <= bvec_end);

        bio_put(bio);
}
1814
1815 struct bio *
1816 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1817                 gfp_t gfp_flags)
1818 {
1819         struct bio *bio;
1820
1821         bio = bio_alloc(gfp_flags, nr_vecs);
1822
1823         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1824                 while (!bio && (nr_vecs /= 2))
1825                         bio = bio_alloc(gfp_flags, nr_vecs);
1826         }
1827
1828         if (bio) {
1829                 bio->bi_size = 0;
1830                 bio->bi_bdev = bdev;
1831                 bio->bi_sector = first_sector;
1832         }
1833         return bio;
1834 }
1835
1836 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1837                           unsigned long bio_flags)
1838 {
1839         int ret = 0;
1840         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1841         struct page *page = bvec->bv_page;
1842         struct extent_io_tree *tree = bio->bi_private;
1843         u64 start;
1844
1845         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1846
1847         bio->bi_private = NULL;
1848
1849         bio_get(bio);
1850
1851         if (tree->ops && tree->ops->submit_bio_hook)
1852                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1853                                            mirror_num, bio_flags, start);
1854         else
1855                 submit_bio(rw, bio);
1856         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1857                 ret = -EOPNOTSUPP;
1858         bio_put(bio);
1859         return ret;
1860 }
1861
static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              struct page *page, sector_t sector,
                              size_t size, unsigned long offset,
                              struct block_device *bdev,
                              struct bio **bio_ret,
                              unsigned long max_pages,
                              bio_end_io_t end_io_func,
                              int mirror_num,
                              unsigned long prev_bio_flags,
                              unsigned long bio_flags)
{
        int ret = 0;
        struct bio *bio;
        int nr;
        int contig = 0;
        int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
        int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
        size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);

        /* try to merge this page into the bio the caller is building up */
        if (bio_ret && *bio_ret) {
                bio = *bio_ret;
                /* compressed bios describe one on-disk extent, so
                 * contiguity is judged against the bio's start sector */
                if (old_compressed)
                        contig = bio->bi_sector == sector;
                else
                        contig = bio->bi_sector + (bio->bi_size >> 9) ==
                                sector;

                /* NOTE: bio_add_page() both tests and performs the add, so
                 * it must stay last in this short-circuit condition; if any
                 * check fails the pending bio is submitted and a new one is
                 * started below */
                if (prev_bio_flags != bio_flags || !contig ||
                    (tree->ops && tree->ops->merge_bio_hook &&
                     tree->ops->merge_bio_hook(page, offset, page_size, bio,
                                               bio_flags)) ||
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
                                             prev_bio_flags);
                        bio = NULL;
                } else {
                        return 0;
                }
        }
        if (this_compressed)
                nr = BIO_MAX_PAGES;
        else
                nr = bio_get_nr_vecs(bdev);

        bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
        if (!bio)
                return -ENOMEM;

        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;

        /* hand the open bio back to the caller for further merging, or
         * submit it immediately when no bio pointer was supplied */
        if (bio_ret)
                *bio_ret = bio;
        else
                ret = submit_one_bio(rw, bio, mirror_num, bio_flags);

        return ret;
}
1921
1922 void set_page_extent_mapped(struct page *page)
1923 {
1924         if (!PagePrivate(page)) {
1925                 SetPagePrivate(page);
1926                 page_cache_get(page);
1927                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1928         }
1929 }
1930
1931 static void set_page_extent_head(struct page *page, unsigned long len)
1932 {
1933         WARN_ON(!PagePrivate(page));
1934         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935 }
1936
1937 /*
1938  * basic readpage implementation.  Locked extent state structs are inserted
1939  * into the tree that are removed when the IO is done (by the end_io
1940  * handlers)
1941  */
static int __extent_read_full_page(struct extent_io_tree *tree,
                                   struct page *page,
                                   get_extent_t *get_extent,
                                   struct bio **bio, int mirror_num,
                                   unsigned long *bio_flags)
{
        struct inode *inode = page->mapping->host;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 page_end = start + PAGE_CACHE_SIZE - 1;
        u64 end;
        u64 cur = start;
        u64 extent_offset;
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        u64 cur_end;
        sector_t sector;
        struct extent_map *em;
        struct block_device *bdev;
        struct btrfs_ordered_extent *ordered;
        int ret;
        int nr = 0;
        size_t pg_offset = 0;
        size_t iosize;
        size_t disk_io_size;
        size_t blocksize = inode->i_sb->s_blocksize;
        unsigned long this_bio_flag = 0;

        set_page_extent_mapped(page);

        /* lock the page's range; if an ordered (in-flight write) extent
         * overlaps it, drop the lock, wait for it to finish and retry */
        end = page_end;
        while (1) {
                lock_extent(tree, start, end, GFP_NOFS);
                ordered = btrfs_lookup_ordered_extent(inode, start);
                if (!ordered)
                        break;
                unlock_extent(tree, start, end, GFP_NOFS);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }

        /* zero the tail of the last page past i_size */
        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
                char *userpage;
                size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);

                if (zero_offset) {
                        iosize = PAGE_CACHE_SIZE - zero_offset;
                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + zero_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
                }
        }
        /* walk the page extent by extent, issuing reads as we go */
        while (cur <= end) {
                /* everything past i_size is a hole: zero it, mark it
                 * uptodate and stop */
                if (cur >= last_byte) {
                        char *userpage;
                        struct extent_state *cached = NULL;

                        iosize = PAGE_CACHE_SIZE - pg_offset;
                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + pg_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
                        unlock_extent_cached(tree, cur, cur + iosize - 1,
                                             &cached, GFP_NOFS);
                        break;
                }
                em = get_extent(inode, page, pg_offset, cur,
                                end - cur + 1, 0);
                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
                        unlock_extent(tree, cur, end, GFP_NOFS);
                        break;
                }
                extent_offset = cur - em->start;
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);

                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        this_bio_flag = EXTENT_BIO_COMPRESSED;
                        extent_set_compress_type(&this_bio_flag,
                                                 em->compress_type);
                }

                /* clamp the IO to the extent and the page, rounded up to
                 * the fs block size */
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
                        /* compressed extents are read whole from disk */
                        disk_io_size = em->block_len;
                        sector = em->block_start >> 9;
                } else {
                        sector = (em->block_start + extent_offset) >> 9;
                        disk_io_size = iosize;
                }
                bdev = em->bdev;
                block_start = em->block_start;
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        block_start = EXTENT_MAP_HOLE;
                free_extent_map(em);
                em = NULL;

                /* we've found a hole, just zero and go on */
                if (block_start == EXTENT_MAP_HOLE) {
                        char *userpage;
                        struct extent_state *cached = NULL;

                        userpage = kmap_atomic(page, KM_USER0);
                        memset(userpage + pg_offset, 0, iosize);
                        flush_dcache_page(page);
                        kunmap_atomic(userpage, KM_USER0);

                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
                        unlock_extent_cached(tree, cur, cur + iosize - 1,
                                             &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }
                /* the get_extent function already copied into the page */
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
                        check_page_uptodate(tree, page);
                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }
                /* we have an inline extent but it didn't get marked up
                 * to date.  Error out
                 */
                if (block_start == EXTENT_MAP_INLINE) {
                        SetPageError(page);
                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }

                ret = 0;
                if (tree->ops && tree->ops->readpage_io_hook) {
                        ret = tree->ops->readpage_io_hook(page, cur,
                                                          cur + iosize - 1);
                }
                if (!ret) {
                        /* cap the bio at the number of pages left in the
                         * file from this page on */
                        unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
                        pnr -= page->index;
                        ret = submit_extent_page(READ, tree, page,
                                         sector, disk_io_size, pg_offset,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
                                         this_bio_flag);
                        nr++;
                        *bio_flags = this_bio_flag;
                }
                if (ret)
                        SetPageError(page);
                cur = cur + iosize;
                pg_offset += iosize;
        }
        /* no IO was submitted: the whole page was holes / already uptodate,
         * so finish it here instead of in the end_io handler */
        if (!nr) {
                if (!PageError(page))
                        SetPageUptodate(page);
                unlock_page(page);
        }
        return 0;
}
2111
2112 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2113                             get_extent_t *get_extent)
2114 {
2115         struct bio *bio = NULL;
2116         unsigned long bio_flags = 0;
2117         int ret;
2118
2119         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2120                                       &bio_flags);
2121         if (bio)
2122                 ret = submit_one_bio(READ, bio, 0, bio_flags);
2123         return ret;
2124 }
2125
2126 static noinline void update_nr_written(struct page *page,
2127                                       struct writeback_control *wbc,
2128                                       unsigned long nr_written)
2129 {
2130         wbc->nr_to_write -= nr_written;
2131         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2132             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2133                 page->mapping->writeback_index = page->index + nr_written;
2134 }
2135
2136 /*
2137  * the writepage semantics are similar to regular writepage.  extent
2138  * records are inserted to lock ranges in the tree, and as dirty areas
2139  * are found, they are marked writeback.  Then the lock bits are removed
2140  * and the end_io handler clears the writeback ranges
2141  */
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                              void *data)
{
        struct inode *inode = page->mapping->host;
        struct extent_page_data *epd = data;
        struct extent_io_tree *tree = epd->tree;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 delalloc_start;
        u64 page_end = start + PAGE_CACHE_SIZE - 1;
        u64 end;
        u64 cur = start;
        u64 extent_offset;
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        u64 iosize;
        sector_t sector;
        struct extent_state *cached_state = NULL;
        struct extent_map *em;
        struct block_device *bdev;
        int ret;
        int nr = 0;
        size_t pg_offset = 0;
        size_t blocksize;
        loff_t i_size = i_size_read(inode);
        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
        u64 nr_delalloc;
        u64 delalloc_end;
        int page_started;
        int compressed;
        int write_flags;
        unsigned long nr_written = 0;

        if (wbc->sync_mode == WB_SYNC_ALL)
                write_flags = WRITE_SYNC;
        else
                write_flags = WRITE;

        trace___extent_writepage(page, inode, wbc);

        WARN_ON(!PageLocked(page));
        /* pages entirely beyond i_size have nothing to write; toss them */
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
           (page->index == end_index && !pg_offset)) {
                page->mapping->a_ops->invalidatepage(page, 0);
                unlock_page(page);
                return 0;
        }

        /* zero the tail of the last page past i_size before writing it */
        if (page->index == end_index) {
                char *userpage;

                userpage = kmap_atomic(page, KM_USER0);
                memset(userpage + pg_offset, 0,
                       PAGE_CACHE_SIZE - pg_offset);
                kunmap_atomic(userpage, KM_USER0);
                flush_dcache_page(page);
        }
        pg_offset = 0;

        set_page_extent_mapped(page);

        delalloc_start = start;
        delalloc_end = 0;
        page_started = 0;
        /* first pass: find and fill any delalloc ranges touching this page
         * (skipped when the caller already locked the extent range) */
        if (!epd->extent_locked) {
                u64 delalloc_to_write = 0;
                /*
                 * make sure the wbc mapping index is at least updated
                 * to this page.
                 */
                update_nr_written(page, wbc, 0);

                while (delalloc_end < page_end) {
                        nr_delalloc = find_lock_delalloc_range(inode, tree,
                                                       page,
                                                       &delalloc_start,
                                                       &delalloc_end,
                                                       128 * 1024 * 1024);
                        if (nr_delalloc == 0) {
                                delalloc_start = delalloc_end + 1;
                                continue;
                        }
                        tree->ops->fill_delalloc(inode, page, delalloc_start,
                                                 delalloc_end, &page_started,
                                                 &nr_written);
                        /*
                         * delalloc_end is already one less than the total
                         * length, so we don't subtract one from
                         * PAGE_CACHE_SIZE
                         */
                        delalloc_to_write += (delalloc_end - delalloc_start +
                                              PAGE_CACHE_SIZE) >>
                                              PAGE_CACHE_SHIFT;
                        delalloc_start = delalloc_end + 1;
                }
                /* raise nr_to_write so this call can cover the pages the
                 * delalloc fill just started IO on, capped at 8192 */
                if (wbc->nr_to_write < delalloc_to_write) {
                        int thresh = 8192;

                        if (delalloc_to_write < thresh * 2)
                                thresh = delalloc_to_write;
                        wbc->nr_to_write = min_t(u64, delalloc_to_write,
                                                 thresh);
                }

                /* did the fill delalloc function already unlock and start
                 * the IO?
                 */
                if (page_started) {
                        ret = 0;
                        /*
                         * we've unlocked the page, so we can't update
                         * the mapping's writeback index, just update
                         * nr_to_write.
                         */
                        wbc->nr_to_write -= nr_written;
                        goto done_unlocked;
                }
        }
        /* -EAGAIN from the start hook means "not now": redirty and bail */
        if (tree->ops && tree->ops->writepage_start_hook) {
                ret = tree->ops->writepage_start_hook(page, start,
                                                      page_end);
                if (ret == -EAGAIN) {
                        redirty_page_for_writepage(wbc, page);
                        update_nr_written(page, wbc, nr_written);
                        unlock_page(page);
                        ret = 0;
                        goto done_unlocked;
                }
        }

        /*
         * we don't want to touch the inode after unlocking the page,
         * so we update the mapping writeback index now
         */
        update_nr_written(page, wbc, nr_written + 1);

        end = page_end;
        if (last_byte <= start) {
                if (tree->ops && tree->ops->writepage_end_io_hook)
                        tree->ops->writepage_end_io_hook(page, start,
                                                         page_end, NULL, 1);
                goto done;
        }

        blocksize = inode->i_sb->s_blocksize;

        /* second pass: walk the page extent by extent and submit writes */
        while (cur <= end) {
                if (cur >= last_byte) {
                        if (tree->ops && tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         page_end, NULL, 1);
                        break;
                }
                em = epd->get_extent(inode, page, pg_offset, cur,
                                     end - cur + 1, 1);
                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
                        break;
                }

                extent_offset = cur - em->start;
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);
                /* clamp the IO to the extent and the page, rounded up to
                 * the fs block size */
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
                sector = (em->block_start + extent_offset) >> 9;
                bdev = em->bdev;
                block_start = em->block_start;
                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                free_extent_map(em);
                em = NULL;

                /*
                 * compressed and inline extents are written through other
                 * paths in the FS
                 */
                if (compressed || block_start == EXTENT_MAP_HOLE ||
                    block_start == EXTENT_MAP_INLINE) {
                        /*
                         * end_io notification does not happen here for
                         * compressed extents
                         */
                        if (!compressed && tree->ops &&
                            tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         cur + iosize - 1,
                                                         NULL, 1);
                        else if (compressed) {
                                /* we don't want to end_page_writeback on
                                 * a compressed extent.  this happens
                                 * elsewhere
                                 */
                                nr++;
                        }

                        cur += iosize;
                        pg_offset += iosize;
                        continue;
                }
                /* leave this out until we have a page_mkwrite call */
                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
                                   EXTENT_DIRTY, 0, NULL)) {
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }

                if (tree->ops && tree->ops->writepage_io_hook) {
                        ret = tree->ops->writepage_io_hook(page, cur,
                                                cur + iosize - 1);
                } else {
                        ret = 0;
                }
                if (ret) {
                        SetPageError(page);
                } else {
                        unsigned long max_nr = end_index + 1;

                        set_range_writeback(tree, cur, cur + iosize - 1);
                        if (!PageWriteback(page)) {
                                printk(KERN_ERR "btrfs warning page %lu not "
                                       "writeback, cur %llu end %llu\n",
                                       page->index, (unsigned long long)cur,
                                       (unsigned long long)end);
                        }

                        ret = submit_extent_page(write_flags, tree, page,
                                                 sector, iosize, pg_offset,
                                                 bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
                                                 0, 0, 0);
                        if (ret)
                                SetPageError(page);
                }
                cur = cur + iosize;
                pg_offset += iosize;
                nr++;
        }
done:
        if (nr == 0) {
                /* make sure the mapping tag for page dirty gets cleared */
                set_page_writeback(page);
                end_page_writeback(page);
        }
        unlock_page(page);

done_unlocked:

        /* drop our reference on any cached states */
        free_extent_state(cached_state);
        return 0;
}
2394
2395 /**
2396  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2397  * @mapping: address space structure to write
2398  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2399  * @writepage: function called for each page
2400  * @data: data passed to writepage function
2401  *
2402  * If a page is already under I/O, write_cache_pages() skips it, even
2403  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2404  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2405  * and msync() need to guarantee that all the data which was dirty at the time
2406  * the call was made get new I/O started against them.  If wbc->sync_mode is
2407  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2408  * existing IO to complete.
2409  */
static int extent_write_cache_pages(struct extent_io_tree *tree,
			     struct address_space *mapping,
			     struct writeback_control *wbc,
			     writepage_t writepage, void *data,
			     void (*flush_fn)(void *))
{
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;	/* set once wbc->nr_to_write is used up */
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;	/* nonzero once we cannot wrap around again */
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		scanned = 1;
	}
	/*
	 * for data integrity syncs we only write pages tagged TOWRITE below;
	 * pages dirtied after tag_pages_for_writeback() are left for the
	 * next writeback pass, which avoids livelocking against a dirtier
	 */
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	while (!done && !nr_to_write_done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point we hold neither mapping->tree_lock nor
			 * lock on the page itself: the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or even
			 * swizzled back from swapper_space to tmpfs file
			 * mapping
			 */
			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
				tree->ops->write_cache_pages_lock_hook(page);
			else
				lock_page(page);

			/* page was truncated or moved to another mapping */
			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				unlock_page(page);
				continue;
			}

			/*
			 * for sync we must wait on writeback; flush our own
			 * pending bio first so we don't wait on IO that is
			 * still queued in it and never submitted
			 */
			if (wbc->sync_mode != WB_SYNC_NONE) {
				if (PageWriteback(page))
					flush_fn(data);
				wait_on_page_writeback(page);
			}

			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = (*writepage)(page, wbc, data);

			/* AOP_WRITEPAGE_ACTIVATE is not a real error */
			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				unlock_page(page);
				ret = 0;
			}
			if (ret)
				done = 1;

			/*
			 * the filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	return ret;
}
2516
2517 static void flush_epd_write_bio(struct extent_page_data *epd)
2518 {
2519         if (epd->bio) {
2520                 if (epd->sync_io)
2521                         submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2522                 else
2523                         submit_one_bio(WRITE, epd->bio, 0, 0);
2524                 epd->bio = NULL;
2525         }
2526 }
2527
2528 static noinline void flush_write_bio(void *data)
2529 {
2530         struct extent_page_data *epd = data;
2531         flush_epd_write_bio(epd);
2532 }
2533
/*
 * Write @page out, then opportunistically write up to 64 more dirty pages
 * that follow it in the file so they can share the same bio.  Returns the
 * result of writing @page itself; the trailing batch's return value is
 * intentionally ignored.
 */
int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
			  get_extent_t *get_extent,
			  struct writeback_control *wbc)
{
	int ret;
	struct address_space *mapping = page->mapping;
	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.get_extent = get_extent,
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};
	/* extra pass: start just past @page, cap at 64 pages */
	struct writeback_control wbc_writepages = {
		.sync_mode	= wbc->sync_mode,
		.older_than_this = NULL,
		.nr_to_write	= 64,
		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
		.range_end	= (loff_t)-1,
	};

	ret = __extent_writepage(page, wbc, &epd);

	extent_write_cache_pages(tree, mapping, &wbc_writepages,
				 __extent_writepage, &epd, flush_write_bio);
	flush_epd_write_bio(&epd);
	return ret;
}
2562
/*
 * Write out the pages covering the inclusive byte range [start, end].
 * extent_locked = 1 tells __extent_writepage that the caller already
 * holds the extent state bits locked for this range (it still does the
 * unlocking).  Pages that are no longer dirty are completed through the
 * writepage_end_io_hook instead of being written again.
 */
int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
			      u64 start, u64 end, get_extent_t *get_extent,
			      int mode)
{
	int ret = 0;
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	/* number of pages touched by the (inclusive) range, rounded up */
	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	struct extent_page_data epd = {
		.bio = NULL,
		.tree = tree,
		.get_extent = get_extent,
		.extent_locked = 1,
		.sync_io = mode == WB_SYNC_ALL,
	};
	struct writeback_control wbc_writepages = {
		.sync_mode	= mode,
		.older_than_this = NULL,
		.nr_to_write	= nr_pages * 2,
		.range_start	= start,
		.range_end	= end + 1,
	};

	while (start <= end) {
		/*
		 * NOTE(review): find_get_page() can return NULL; this code
		 * assumes every page in the range is present in the page
		 * cache -- confirm callers guarantee that.
		 */
		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
		if (clear_page_dirty_for_io(page))
			ret = __extent_writepage(page, &wbc_writepages, &epd);
		else {
			/* already clean: just report the range as done */
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, start,
						 start + PAGE_CACHE_SIZE - 1,
						 NULL, 1);
			unlock_page(page);
		}
		page_cache_release(page);
		start += PAGE_CACHE_SIZE;
	}

	flush_epd_write_bio(&epd);
	return ret;
}
2606
2607 int extent_writepages(struct extent_io_tree *tree,
2608                       struct address_space *mapping,
2609                       get_extent_t *get_extent,
2610                       struct writeback_control *wbc)
2611 {
2612         int ret = 0;
2613         struct extent_page_data epd = {
2614                 .bio = NULL,
2615                 .tree = tree,
2616                 .get_extent = get_extent,
2617                 .extent_locked = 0,
2618                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2619         };
2620
2621         ret = extent_write_cache_pages(tree, mapping, wbc,
2622                                        __extent_writepage, &epd,
2623                                        flush_write_bio);
2624         flush_epd_write_bio(&epd);
2625         return ret;
2626 }
2627
2628 int extent_readpages(struct extent_io_tree *tree,
2629                      struct address_space *mapping,
2630                      struct list_head *pages, unsigned nr_pages,
2631                      get_extent_t get_extent)
2632 {
2633         struct bio *bio = NULL;
2634         unsigned page_idx;
2635         unsigned long bio_flags = 0;
2636
2637         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2638                 struct page *page = list_entry(pages->prev, struct page, lru);
2639
2640                 prefetchw(&page->flags);
2641                 list_del(&page->lru);
2642                 if (!add_to_page_cache_lru(page, mapping,
2643                                         page->index, GFP_NOFS)) {
2644                         __extent_read_full_page(tree, page, get_extent,
2645                                                 &bio, 0, &bio_flags);
2646                 }
2647                 page_cache_release(page);
2648         }
2649         BUG_ON(!list_empty(pages));
2650         if (bio)
2651                 submit_one_bio(READ, bio, 0, bio_flags);
2652         return 0;
2653 }
2654
2655 /*
2656  * basic invalidatepage code, this waits on any locked or writeback
2657  * ranges corresponding to the page, and then deletes any extent state
2658  * records from the tree
2659  */
2660 int extent_invalidatepage(struct extent_io_tree *tree,
2661                           struct page *page, unsigned long offset)
2662 {
2663         struct extent_state *cached_state = NULL;
2664         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2665         u64 end = start + PAGE_CACHE_SIZE - 1;
2666         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2667
2668         start += (offset + blocksize - 1) & ~(blocksize - 1);
2669         if (start > end)
2670                 return 0;
2671
2672         lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2673         wait_on_page_writeback(page);
2674         clear_extent_bit(tree, start, end,
2675                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2676                          EXTENT_DO_ACCOUNTING,
2677                          1, 1, &cached_state, GFP_NOFS);
2678         return 0;
2679 }
2680
2681 /*
2682  * a helper for releasepage, this tests for areas of the page that
2683  * are locked or under IO and drops the related state bits if it is safe
2684  * to drop the page.
2685  */
2686 int try_release_extent_state(struct extent_map_tree *map,
2687                              struct extent_io_tree *tree, struct page *page,
2688                              gfp_t mask)
2689 {
2690         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2691         u64 end = start + PAGE_CACHE_SIZE - 1;
2692         int ret = 1;
2693
2694         if (test_range_bit(tree, start, end,
2695                            EXTENT_IOBITS, 0, NULL))
2696                 ret = 0;
2697         else {
2698                 if ((mask & GFP_NOFS) == GFP_NOFS)
2699                         mask = GFP_NOFS;
2700                 /*
2701                  * at this point we can safely clear everything except the
2702                  * locked bit and the nodatasum bit
2703                  */
2704                 ret = clear_extent_bit(tree, start, end,
2705                                  ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2706                                  0, 0, NULL, mask);
2707
2708                 /* if clear_extent_bit failed for enomem reasons,
2709                  * we can't allow the release to continue.
2710                  */
2711                 if (ret < 0)
2712                         ret = 0;
2713                 else
2714                         ret = 1;
2715         }
2716         return ret;
2717 }
2718
/*
 * a helper for releasepage.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed
 */
int try_release_extent_mapping(struct extent_map_tree *map,
			       struct extent_io_tree *tree, struct page *page,
			       gfp_t mask)
{
	struct extent_map *em;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;

	/*
	 * only prune extent maps when the allocation may block and the
	 * file is big enough (> 16M) for the cached maps to matter
	 */
	if ((mask & __GFP_WAIT) &&
	    page->mapping->host->i_size > 16 * 1024 * 1024) {
		u64 len;
		while (start <= end) {
			len = end - start + 1;
			write_lock(&map->lock);
			em = lookup_extent_mapping(map, start, len);
			if (IS_ERR_OR_NULL(em)) {
				write_unlock(&map->lock);
				break;
			}
			/* pinned maps, or maps not starting here, must stay */
			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
			    em->start != start) {
				write_unlock(&map->lock);
				free_extent_map(em);
				break;
			}
			/* only drop maps with no locked or in-flight ranges */
			if (!test_range_bit(tree, em->start,
					    extent_map_end(em) - 1,
					    EXTENT_LOCKED | EXTENT_WRITEBACK,
					    0, NULL)) {
				remove_extent_mapping(map, em);
				/* once for the rb tree */
				free_extent_map(em);
			}
			start = extent_map_end(em);
			write_unlock(&map->lock);

			/* once for us */
			free_extent_map(em);
		}
	}
	return try_release_extent_state(map, tree, page, mask);
}
2766
2767 /*
2768  * helper function for fiemap, which doesn't want to see any holes.
2769  * This maps until we find something past 'last'
2770  */
2771 static struct extent_map *get_extent_skip_holes(struct inode *inode,
2772                                                 u64 offset,
2773                                                 u64 last,
2774                                                 get_extent_t *get_extent)
2775 {
2776         u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2777         struct extent_map *em;
2778         u64 len;
2779
2780         if (offset >= last)
2781                 return NULL;
2782
2783         while(1) {
2784                 len = last - offset;
2785                 if (len == 0)
2786                         break;
2787                 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2788                 em = get_extent(inode, NULL, 0, offset, len, 0);
2789                 if (IS_ERR_OR_NULL(em))
2790                         return em;
2791
2792                 /* if this isn't a hole return it */
2793                 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2794                     em->block_start != EXTENT_MAP_HOLE) {
2795                         return em;
2796                 }
2797
2798                 /* this is a hole, advance to the next extent */
2799                 offset = extent_map_end(em);
2800                 free_extent_map(em);
2801                 if (offset >= last)
2802                         break;
2803         }
2804         return NULL;
2805 }
2806
2807 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2808                 __u64 start, __u64 len, get_extent_t *get_extent)
2809 {
2810         int ret = 0;
2811         u64 off = start;
2812         u64 max = start + len;
2813         u32 flags = 0;
2814         u32 found_type;
2815         u64 last;
2816         u64 last_for_get_extent = 0;
2817         u64 disko = 0;
2818         u64 isize = i_size_read(inode);
2819         struct btrfs_key found_key;
2820         struct extent_map *em = NULL;
2821         struct extent_state *cached_state = NULL;
2822         struct btrfs_path *path;
2823         struct btrfs_file_extent_item *item;
2824         int end = 0;
2825         u64 em_start = 0;
2826         u64 em_len = 0;
2827         u64 em_end = 0;
2828         unsigned long emflags;
2829
2830         if (len == 0)
2831                 return -EINVAL;
2832
2833         path = btrfs_alloc_path();
2834         if (!path)
2835                 return -ENOMEM;
2836         path->leave_spinning = 1;
2837
2838         /*
2839          * lookup the last file extent.  We're not using i_size here
2840          * because there might be preallocation past i_size
2841          */
2842         ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2843                                        path, btrfs_ino(inode), -1, 0);
2844         if (ret < 0) {
2845                 btrfs_free_path(path);
2846                 return ret;
2847         }
2848         WARN_ON(!ret);
2849         path->slots[0]--;
2850         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2851                               struct btrfs_file_extent_item);
2852         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2853         found_type = btrfs_key_type(&found_key);
2854
2855         /* No extents, but there might be delalloc bits */
2856         if (found_key.objectid != btrfs_ino(inode) ||
2857             found_type != BTRFS_EXTENT_DATA_KEY) {
2858                 /* have to trust i_size as the end */
2859                 last = (u64)-1;
2860                 last_for_get_extent = isize;
2861         } else {
2862                 /*
2863                  * remember the start of the last extent.  There are a
2864                  * bunch of different factors that go into the length of the
2865                  * extent, so its much less complex to remember where it started
2866                  */
2867                 last = found_key.offset;
2868                 last_for_get_extent = last + 1;
2869         }
2870         btrfs_free_path(path);
2871
2872         /*
2873          * we might have some extents allocated but more delalloc past those
2874          * extents.  so, we trust isize unless the start of the last extent is
2875          * beyond isize
2876          */
2877         if (last < isize) {
2878                 last = (u64)-1;
2879                 last_for_get_extent = isize;
2880         }
2881
2882         lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2883                          &cached_state, GFP_NOFS);
2884
2885         em = get_extent_skip_holes(inode, off, last_for_get_extent,
2886                                    get_extent);
2887         if (!em)
2888                 goto out;
2889         if (IS_ERR(em)) {
2890                 ret = PTR_ERR(em);
2891                 goto out;
2892         }
2893
2894         while (!end) {
2895                 u64 offset_in_extent;
2896
2897                 /* break if the extent we found is outside the range */
2898                 if (em->start >= max || extent_map_end(em) < off)
2899                         break;
2900
2901                 /*
2902                  * get_extent may return an extent that starts before our
2903                  * requested range.  We have to make sure the ranges
2904                  * we return to fiemap always move forward and don't
2905                  * overlap, so adjust the offsets here
2906                  */
2907                 em_start = max(em->start, off);
2908
2909                 /*
2910                  * record the offset from the start of the extent
2911                  * for adjusting the disk offset below
2912                  */
2913                 offset_in_extent = em_start - em->start;
2914                 em_end = extent_map_end(em);
2915                 em_len = em_end - em_start;
2916                 emflags = em->flags;
2917                 disko = 0;
2918                 flags = 0;
2919
2920                 /*
2921                  * bump off for our next call to get_extent
2922                  */
2923                 off = extent_map_end(em);
2924                 if (off >= max)
2925                         end = 1;
2926
2927                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2928                         end = 1;
2929                         flags |= FIEMAP_EXTENT_LAST;
2930                 } else if (em->block_start == EXTENT_MAP_INLINE) {
2931                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
2932                                   FIEMAP_EXTENT_NOT_ALIGNED);
2933                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2934                         flags |= (FIEMAP_EXTENT_DELALLOC |
2935                                   FIEMAP_EXTENT_UNKNOWN);
2936                 } else {
2937                         disko = em->block_start + offset_in_extent;
2938                 }
2939                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2940                         flags |= FIEMAP_EXTENT_ENCODED;
2941
2942                 free_extent_map(em);
2943                 em = NULL;
2944                 if ((em_start >= last) || em_len == (u64)-1 ||
2945                    (last == (u64)-1 && isize <= em_end)) {
2946                         flags |= FIEMAP_EXTENT_LAST;
2947                         end = 1;
2948                 }
2949
2950                 /* now scan forward to see if this is really the last extent. */
2951                 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2952                                            get_extent);
2953                 if (IS_ERR(em)) {
2954                         ret = PTR_ERR(em);
2955                         goto out;
2956                 }
2957                 if (!em) {
2958                         flags |= FIEMAP_EXTENT_LAST;
2959                         end = 1;
2960                 }
2961                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2962                                               em_len, flags);
2963                 if (ret)
2964                         goto out_free;
2965         }
2966 out_free:
2967         free_extent_map(em);
2968 out:
2969         unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
2970                              &cached_state, GFP_NOFS);
2971         return ret;
2972 }
2973
2974 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2975                                               unsigned long i)
2976 {
2977         struct page *p;
2978         struct address_space *mapping;
2979
2980         if (i == 0)
2981                 return eb->first_page;
2982         i += eb->start >> PAGE_CACHE_SHIFT;
2983         mapping = eb->first_page->mapping;
2984         if (!mapping)
2985                 return NULL;
2986
2987         /*
2988          * extent_buffer_page is only called after pinning the page
2989          * by increasing the reference count.  So we know the page must
2990          * be in the radix tree.
2991          */
2992         rcu_read_lock();
2993         p = radix_tree_lookup(&mapping->page_tree, i);
2994         rcu_read_unlock();
2995
2996         return p;
2997 }
2998
2999 static inline unsigned long num_extent_pages(u64 start, u64 len)
3000 {
3001         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3002                 (start >> PAGE_CACHE_SHIFT);
3003 }
3004
/*
 * Allocate and minimally initialize an extent_buffer covering the byte
 * range [start, start + len); pages are attached by the caller.  The
 * new buffer starts with one reference.  Returns NULL if the slab
 * allocation fails.
 */
static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
						   u64 start,
						   unsigned long len,
						   gfp_t mask)
{
	struct extent_buffer *eb = NULL;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (eb == NULL)
		return NULL;
	eb->start = start;
	eb->len = len;
	spin_lock_init(&eb->lock);
	init_waitqueue_head(&eb->lock_wq);

#if LEAK_DEBUG
	/* record the buffer on the global list used for leak checking */
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&eb->leak_list, &buffers);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&eb->refs, 1);

	return eb;
}
3032
/*
 * Free the extent_buffer struct itself; the pages it referenced are
 * released separately (see btrfs_release_extent_buffer_page).
 */
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
	unsigned long flags;
	/* drop the buffer from the leak-tracking list */
	spin_lock_irqsave(&leak_lock, flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	kmem_cache_free(extent_buffer_cache, eb);
}
3043
3044 /*
3045  * Helper for releasing extent buffer page.
3046  */
3047 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3048                                                 unsigned long start_idx)
3049 {
3050         unsigned long index;
3051         struct page *page;
3052
3053         if (!eb->first_page)
3054                 return;
3055