Btrfs: fix how we merge extent states and deal with cached states
[pandora-kernel.git] fs/btrfs/extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include "extent_io.h"
14 #include "extent_map.h"
15 #include "compat.h"
16 #include "ctree.h"
17 #include "btrfs_inode.h"
18
19 static struct kmem_cache *extent_state_cache;
20 static struct kmem_cache *extent_buffer_cache;
21
22 static LIST_HEAD(buffers);
23 static LIST_HEAD(states);
24
25 #define LEAK_DEBUG 0
26 #if LEAK_DEBUG
27 static DEFINE_SPINLOCK(leak_lock);
28 #endif
29
30 #define BUFFER_LRU_MAX 64
31
32 struct tree_entry {
33         u64 start;
34         u64 end;
35         struct rb_node rb_node;
36 };
37
38 struct extent_page_data {
39         struct bio *bio;
40         struct extent_io_tree *tree;
41         get_extent_t *get_extent;
42
43         /* tells writepage not to lock the state bits for this range;
44          * it still does the unlocking
45          */
46         unsigned int extent_locked:1;
47
48         /* tells the submit_bio code to use a WRITE_SYNC */
49         unsigned int sync_io:1;
50 };
51
52 int __init extent_io_init(void)
53 {
54         extent_state_cache = kmem_cache_create("extent_state",
55                         sizeof(struct extent_state), 0,
56                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
57         if (!extent_state_cache)
58                 return -ENOMEM;
59
60         extent_buffer_cache = kmem_cache_create("extent_buffers",
61                         sizeof(struct extent_buffer), 0,
62                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
63         if (!extent_buffer_cache)
64                 goto free_state_cache;
65         return 0;
66
67 free_state_cache:
68         kmem_cache_destroy(extent_state_cache);
69         return -ENOMEM;
70 }
71
72 void extent_io_exit(void)
73 {
74         struct extent_state *state;
75         struct extent_buffer *eb;
76
77         while (!list_empty(&states)) {
78                 state = list_entry(states.next, struct extent_state, leak_list);
79                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
80                        "state %lu in tree %p refs %d\n",
81                        (unsigned long long)state->start,
82                        (unsigned long long)state->end,
83                        state->state, state->tree, atomic_read(&state->refs));
84                 list_del(&state->leak_list);
85                 kmem_cache_free(extent_state_cache, state);
86
87         }
88
89         while (!list_empty(&buffers)) {
90                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
91                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
92                        "refs %d\n", (unsigned long long)eb->start,
93                        eb->len, atomic_read(&eb->refs));
94                 list_del(&eb->leak_list);
95                 kmem_cache_free(extent_buffer_cache, eb);
96         }
97         if (extent_state_cache)
98                 kmem_cache_destroy(extent_state_cache);
99         if (extent_buffer_cache)
100                 kmem_cache_destroy(extent_buffer_cache);
101 }
102
103 void extent_io_tree_init(struct extent_io_tree *tree,
104                          struct address_space *mapping)
105 {
106         tree->state = RB_ROOT;
107         INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
108         tree->ops = NULL;
109         tree->dirty_bytes = 0;
110         spin_lock_init(&tree->lock);
111         spin_lock_init(&tree->buffer_lock);
112         tree->mapping = mapping;
113 }
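/*
 * Illustrative sketch, not taken from this file: a tree is normally set up
 * once per inode and pointed at that inode's mapping, roughly like
 *
 *      extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
 *
 * where 'inode' is a placeholder for whatever struct inode the caller owns.
 */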
114
115 static struct extent_state *alloc_extent_state(gfp_t mask)
116 {
117         struct extent_state *state;
118 #if LEAK_DEBUG
119         unsigned long flags;
120 #endif
121
122         state = kmem_cache_alloc(extent_state_cache, mask);
123         if (!state)
124                 return state;
125         state->state = 0;
126         state->private = 0;
127         state->tree = NULL;
128 #if LEAK_DEBUG
129         spin_lock_irqsave(&leak_lock, flags);
130         list_add(&state->leak_list, &states);
131         spin_unlock_irqrestore(&leak_lock, flags);
132 #endif
133         atomic_set(&state->refs, 1);
134         init_waitqueue_head(&state->wq);
135         return state;
136 }
137
138 void free_extent_state(struct extent_state *state)
139 {
140         if (!state)
141                 return;
142         if (atomic_dec_and_test(&state->refs)) {
143 #if LEAK_DEBUG
144                 unsigned long flags;
145 #endif
146                 WARN_ON(state->tree);
147 #if LEAK_DEBUG
148                 spin_lock_irqsave(&leak_lock, flags);
149                 list_del(&state->leak_list);
150                 spin_unlock_irqrestore(&leak_lock, flags);
151 #endif
152                 kmem_cache_free(extent_state_cache, state);
153         }
154 }
155
156 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
157                                    struct rb_node *node)
158 {
159         struct rb_node **p = &root->rb_node;
160         struct rb_node *parent = NULL;
161         struct tree_entry *entry;
162
163         while (*p) {
164                 parent = *p;
165                 entry = rb_entry(parent, struct tree_entry, rb_node);
166
167                 if (offset < entry->start)
168                         p = &(*p)->rb_left;
169                 else if (offset > entry->end)
170                         p = &(*p)->rb_right;
171                 else
172                         return parent;
173         }
174
175         entry = rb_entry(node, struct tree_entry, rb_node);
176         rb_link_node(node, parent, p);
177         rb_insert_color(node, root);
178         return NULL;
179 }
180
181 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
182                                      struct rb_node **prev_ret,
183                                      struct rb_node **next_ret)
184 {
185         struct rb_root *root = &tree->state;
186         struct rb_node *n = root->rb_node;
187         struct rb_node *prev = NULL;
188         struct rb_node *orig_prev = NULL;
189         struct tree_entry *entry;
190         struct tree_entry *prev_entry = NULL;
191
192         while (n) {
193                 entry = rb_entry(n, struct tree_entry, rb_node);
194                 prev = n;
195                 prev_entry = entry;
196
197                 if (offset < entry->start)
198                         n = n->rb_left;
199                 else if (offset > entry->end)
200                         n = n->rb_right;
201                 else
202                         return n;
203         }
204
205         if (prev_ret) {
206                 orig_prev = prev;
207                 while (prev && offset > prev_entry->end) {
208                         prev = rb_next(prev);
209                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
210                 }
211                 *prev_ret = prev;
212                 prev = orig_prev;
213         }
214
215         if (next_ret) {
216                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
217                 while (prev && offset < prev_entry->start) {
218                         prev = rb_prev(prev);
219                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
220                 }
221                 *next_ret = prev;
222         }
223         return NULL;
224 }
225
226 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
227                                           u64 offset)
228 {
229         struct rb_node *prev = NULL;
230         struct rb_node *ret;
231
232         ret = __etree_search(tree, offset, &prev, NULL);
233         if (!ret)
234                 return prev;
235         return ret;
236 }
237
238 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
239                      struct extent_state *other)
240 {
241         if (tree->ops && tree->ops->merge_extent_hook)
242                 tree->ops->merge_extent_hook(tree->mapping->host, new,
243                                              other);
244 }
245
246 /*
247  * utility function to look for merge candidates inside a given range.
248  * Any extents with matching state are merged together into a single
249  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY in their
250  * state field are not merged, because the end_io handlers need to be able to
251  * do operations on them without sleeping (or doing allocations/splits).
252  *
253  * This should be called with the tree lock held.
254  */
255 static int merge_state(struct extent_io_tree *tree,
256                        struct extent_state *state)
257 {
258         struct extent_state *other;
259         struct rb_node *other_node;
260
261         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
262                 return 0;
263
264         other_node = rb_prev(&state->rb_node);
265         if (other_node) {
266                 other = rb_entry(other_node, struct extent_state, rb_node);
267                 if (other->end == state->start - 1 &&
268                     other->state == state->state) {
269                         merge_cb(tree, state, other);
270                         state->start = other->start;
271                         other->tree = NULL;
272                         rb_erase(&other->rb_node, &tree->state);
273                         free_extent_state(other);
274                 }
275         }
276         other_node = rb_next(&state->rb_node);
277         if (other_node) {
278                 other = rb_entry(other_node, struct extent_state, rb_node);
279                 if (other->start == state->end + 1 &&
280                     other->state == state->state) {
281                         merge_cb(tree, state, other);
282                         state->end = other->end;
283                         other->tree = NULL;
284                         rb_erase(&other->rb_node, &tree->state);
285                         free_extent_state(other);
286                 }
287         }
288
289         return 0;
290 }
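/*
 * Illustrative sketch of the merge behaviour (hypothetical caller, made-up
 * offsets): two adjacent ranges that end up with identical state bits are
 * collapsed into a single struct, because merge_state() runs after every set:
 *
 *      set_extent_dirty(tree, 0, 4095, GFP_NOFS);
 *      set_extent_dirty(tree, 4096, 8191, GFP_NOFS);
 *
 * Assuming nothing else was set on either range, the tree now holds one
 * [0, 8191] EXTENT_DIRTY state.  Ranges carrying EXTENT_IOBITS or
 * EXTENT_BOUNDARY are never merged, so locked extents keep their boundaries.
 */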
291
292 static int set_state_cb(struct extent_io_tree *tree,
293                          struct extent_state *state, int *bits)
294 {
295         if (tree->ops && tree->ops->set_bit_hook) {
296                 return tree->ops->set_bit_hook(tree->mapping->host,
297                                                state, bits);
298         }
299
300         return 0;
301 }
302
303 static void clear_state_cb(struct extent_io_tree *tree,
304                            struct extent_state *state, int *bits)
305 {
306         if (tree->ops && tree->ops->clear_bit_hook)
307                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
308 }
309
310 /*
311  * insert an extent_state struct into the tree.  'bits' are set on the
312  * struct before it is inserted.
313  *
314  * This may return -EEXIST if the extent is already there, in which case the
315  * caller is responsible for freeing the state struct.
316  *
317  * The tree lock is not taken internally.  This is a utility function and
318  * probably isn't what you want to call (see set/clear_extent_bit).
319  */
320 static int insert_state(struct extent_io_tree *tree,
321                         struct extent_state *state, u64 start, u64 end,
322                         int *bits)
323 {
324         struct rb_node *node;
325         int bits_to_set = *bits & ~EXTENT_CTLBITS;
326         int ret;
327
328         if (end < start) {
329                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
330                        (unsigned long long)end,
331                        (unsigned long long)start);
332                 WARN_ON(1);
333         }
334         state->start = start;
335         state->end = end;
336         ret = set_state_cb(tree, state, bits);
337         if (ret)
338                 return ret;
339
340         if (bits_to_set & EXTENT_DIRTY)
341                 tree->dirty_bytes += end - start + 1;
342         state->state |= bits_to_set;
343         node = tree_insert(&tree->state, end, &state->rb_node);
344         if (node) {
345                 struct extent_state *found;
346                 found = rb_entry(node, struct extent_state, rb_node);
347                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
348                        "%llu %llu\n", (unsigned long long)found->start,
349                        (unsigned long long)found->end,
350                        (unsigned long long)start, (unsigned long long)end);
351                 return -EEXIST;
352         }
353         state->tree = tree;
354         merge_state(tree, state);
355         return 0;
356 }
357
358 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
359                      u64 split)
360 {
361         if (tree->ops && tree->ops->split_extent_hook)
362                 return tree->ops->split_extent_hook(tree->mapping->host,
363                                                     orig, split);
364         return 0;
365 }
366
367 /*
368  * split a given extent state struct in two, inserting the preallocated
369  * struct 'prealloc' as the newly created first half.  'split' indicates an
370  * offset inside 'orig' where it should be split.
371  *
372  * Before calling,
373  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
374  * are two extent state structs in the tree:
375  * prealloc: [orig->start, split - 1]
376  * orig: [ split, orig->end ]
377  *
378  * The tree locks are not taken by this function. They need to be held
379  * by the caller.
380  */
381 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
382                        struct extent_state *prealloc, u64 split)
383 {
384         struct rb_node *node;
385
386         split_cb(tree, orig, split);
387
388         prealloc->start = orig->start;
389         prealloc->end = split - 1;
390         prealloc->state = orig->state;
391         orig->start = split;
392
393         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
394         if (node) {
395                 free_extent_state(prealloc);
396                 return -EEXIST;
397         }
398         prealloc->tree = tree;
399         return 0;
400 }
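/*
 * Illustrative sketch with made-up numbers: splitting a [0, 8191] state at
 * offset 4096,
 *
 *      err = split_state(tree, state, prealloc, 4096);
 *
 * leaves 'prealloc' covering [0, 4095] and 'state' covering [4096, 8191],
 * both with the original bits.  'prealloc' must have been allocated by the
 * caller beforehand (see alloc_extent_state_atomic() below).
 */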
401
402 /*
403  * utility function to clear some bits in an extent state struct.
404  * it will optionally wake up anyone waiting on this state (wake == 1).
405  * Control bits (EXTENT_CTLBITS) are masked out of 'bits' before clearing.
406  *
407  * If no bits are set on the state struct after clearing things, the
408  * struct is freed and removed from the tree
409  */
410 static int clear_state_bit(struct extent_io_tree *tree,
411                             struct extent_state *state,
412                             int *bits, int wake)
413 {
414         int bits_to_clear = *bits & ~EXTENT_CTLBITS;
415         int ret = state->state & bits_to_clear;
416
417         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
418                 u64 range = state->end - state->start + 1;
419                 WARN_ON(range > tree->dirty_bytes);
420                 tree->dirty_bytes -= range;
421         }
422         clear_state_cb(tree, state, bits);
423         state->state &= ~bits_to_clear;
424         if (wake)
425                 wake_up(&state->wq);
426         if (state->state == 0) {
427                 if (state->tree) {
428                         rb_erase(&state->rb_node, &tree->state);
429                         state->tree = NULL;
430                         free_extent_state(state);
431                 } else {
432                         WARN_ON(1);
433                 }
434         } else {
435                 merge_state(tree, state);
436         }
437         return ret;
438 }
439
440 static struct extent_state *
441 alloc_extent_state_atomic(struct extent_state *prealloc)
442 {
443         if (!prealloc)
444                 prealloc = alloc_extent_state(GFP_ATOMIC);
445
446         return prealloc;
447 }
448
449 /*
450  * clear some bits on a range in the tree.  This may require splitting
451  * or inserting elements in the tree, so the gfp mask is used to
452  * indicate which allocations or sleeping are allowed.
453  *
454  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
455  * the given range from the tree regardless of state (i.e. for truncate).
456  *
457  * the range [start, end] is inclusive.
458  *
459  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
460  * bits were already set, or zero if none of the bits were already set.
461  */
462 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
463                      int bits, int wake, int delete,
464                      struct extent_state **cached_state,
465                      gfp_t mask)
466 {
467         struct extent_state *state;
468         struct extent_state *cached;
469         struct extent_state *prealloc = NULL;
470         struct rb_node *next_node;
471         struct rb_node *node;
472         u64 last_end;
473         int err;
474         int set = 0;
475         int clear = 0;
476
477         if (delete)
478                 bits |= ~EXTENT_CTLBITS;
479         bits |= EXTENT_FIRST_DELALLOC;
480
481         if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
482                 clear = 1;
483 again:
484         if (!prealloc && (mask & __GFP_WAIT)) {
485                 prealloc = alloc_extent_state(mask);
486                 if (!prealloc)
487                         return -ENOMEM;
488         }
489
490         spin_lock(&tree->lock);
491         if (cached_state) {
492                 cached = *cached_state;
493
494                 if (clear) {
495                         *cached_state = NULL;
496                         cached_state = NULL;
497                 }
498
499                 if (cached && cached->tree && cached->start <= start &&
500                     cached->end > start) {
501                         if (clear)
502                                 atomic_dec(&cached->refs);
503                         state = cached;
504                         goto hit_next;
505                 }
506                 if (clear)
507                         free_extent_state(cached);
508         }
509         /*
510          * this search will find the extents that end after
511          * our range starts
512          */
513         node = tree_search(tree, start);
514         if (!node)
515                 goto out;
516         state = rb_entry(node, struct extent_state, rb_node);
517 hit_next:
518         if (state->start > end)
519                 goto out;
520         WARN_ON(state->end < start);
521         last_end = state->end;
522
523         /*
524          *     | ---- desired range ---- |
525          *  | state | or
526          *  | ------------- state -------------- |
527          *
528          * We need to split the extent we found, and may flip
529          * bits on second half.
530          *
531          * If the extent we found extends past our range, we
532          * just split and search again.  It'll get split again
533          * the next time though.
534          *
535          * If the extent we found is inside our range, we clear
536          * the desired bit on it.
537          */
538
539         if (state->start < start) {
540                 prealloc = alloc_extent_state_atomic(prealloc);
541                 BUG_ON(!prealloc);
542                 err = split_state(tree, state, prealloc, start);
543                 BUG_ON(err == -EEXIST);
544                 prealloc = NULL;
545                 if (err)
546                         goto out;
547                 if (state->end <= end) {
548                         set |= clear_state_bit(tree, state, &bits, wake);
549                         if (last_end == (u64)-1)
550                                 goto out;
551                         start = last_end + 1;
552                 }
553                 goto search_again;
554         }
555         /*
556          * | ---- desired range ---- |
557          *                        | state |
558          * We need to split the extent, and clear the bit
559          * on the first half
560          */
561         if (state->start <= end && state->end > end) {
562                 prealloc = alloc_extent_state_atomic(prealloc);
563                 BUG_ON(!prealloc);
564                 err = split_state(tree, state, prealloc, end + 1);
565                 BUG_ON(err == -EEXIST);
566                 if (wake)
567                         wake_up(&state->wq);
568
569                 set |= clear_state_bit(tree, prealloc, &bits, wake);
570
571                 prealloc = NULL;
572                 goto out;
573         }
574
575         if (state->end < end && prealloc && !need_resched())
576                 next_node = rb_next(&state->rb_node);
577         else
578                 next_node = NULL;
579
580         set |= clear_state_bit(tree, state, &bits, wake);
581         if (last_end == (u64)-1)
582                 goto out;
583         start = last_end + 1;
584         if (start <= end && next_node) {
585                 state = rb_entry(next_node, struct extent_state,
586                                  rb_node);
587                 if (state->start == start)
588                         goto hit_next;
589         }
590         goto search_again;
591
592 out:
593         spin_unlock(&tree->lock);
594         if (prealloc)
595                 free_extent_state(prealloc);
596
597         return set;
598
599 search_again:
600         if (start > end)
601                 goto out;
602         spin_unlock(&tree->lock);
603         if (mask & __GFP_WAIT)
604                 cond_resched();
605         goto again;
606 }
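/*
 * Illustrative sketch (hypothetical truncate-style caller): with delete == 1
 * every non-control bit is cleared, so the state structs covering the range
 * are dropped from the tree entirely and any sleepers are woken:
 *
 *      clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC,
 *                       1, 1, NULL, GFP_NOFS);
 *
 * The exact bit mask matters little in the delete case, because the function
 * widens it to everything outside EXTENT_CTLBITS.
 */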
607
608 static int wait_on_state(struct extent_io_tree *tree,
609                          struct extent_state *state)
610                 __releases(tree->lock)
611                 __acquires(tree->lock)
612 {
613         DEFINE_WAIT(wait);
614         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
615         spin_unlock(&tree->lock);
616         schedule();
617         spin_lock(&tree->lock);
618         finish_wait(&state->wq, &wait);
619         return 0;
620 }
621
622 /*
623  * waits for one or more bits to clear on a range in the state tree.
624  * The range [start, end] is inclusive.
625  * The tree lock is taken by this function
626  */
627 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
628 {
629         struct extent_state *state;
630         struct rb_node *node;
631
632         spin_lock(&tree->lock);
633 again:
634         while (1) {
635                 /*
636                  * this search will find all the extents that end after
637                  * our range starts
638                  */
639                 node = tree_search(tree, start);
640                 if (!node)
641                         break;
642
643                 state = rb_entry(node, struct extent_state, rb_node);
644
645                 if (state->start > end)
646                         goto out;
647
648                 if (state->state & bits) {
649                         start = state->start;
650                         atomic_inc(&state->refs);
651                         wait_on_state(tree, state);
652                         free_extent_state(state);
653                         goto again;
654                 }
655                 start = state->end + 1;
656
657                 if (start > end)
658                         break;
659
660                 if (need_resched()) {
661                         spin_unlock(&tree->lock);
662                         cond_resched();
663                         spin_lock(&tree->lock);
664                 }
665         }
666 out:
667         spin_unlock(&tree->lock);
668         return 0;
669 }
670
671 static int set_state_bits(struct extent_io_tree *tree,
672                            struct extent_state *state,
673                            int *bits)
674 {
675         int ret;
676         int bits_to_set = *bits & ~EXTENT_CTLBITS;
677
678         ret = set_state_cb(tree, state, bits);
679         if (ret)
680                 return ret;
681         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
682                 u64 range = state->end - state->start + 1;
683                 tree->dirty_bytes += range;
684         }
685         state->state |= bits_to_set;
686
687         return 0;
688 }
689
690 static void cache_state(struct extent_state *state,
691                         struct extent_state **cached_ptr)
692 {
693         if (cached_ptr && !(*cached_ptr)) {
694                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
695                         *cached_ptr = state;
696                         atomic_inc(&state->refs);
697                 }
698         }
699 }
700
701 static void uncache_state(struct extent_state **cached_ptr)
702 {
703         if (cached_ptr && (*cached_ptr)) {
704                 struct extent_state *state = *cached_ptr;
705                 *cached_ptr = NULL;
706                 free_extent_state(state);
707         }
708 }
709
710 /*
711  * set some bits on a range in the tree.  This may require allocations or
712  * sleeping, so the gfp mask is used to indicate what is allowed.
713  *
714  * If any of the exclusive bits are set, this will fail with -EEXIST if some
715  * part of the range already has the desired bits set.  The start of the
716  * existing range is returned in failed_start in this case.
717  *
718  * [start, end] is inclusive.  This takes the tree lock.
719  */
720
721 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
722                    int bits, int exclusive_bits, u64 *failed_start,
723                    struct extent_state **cached_state, gfp_t mask)
724 {
725         struct extent_state *state;
726         struct extent_state *prealloc = NULL;
727         struct rb_node *node;
728         int err = 0;
729         u64 last_start;
730         u64 last_end;
731
732         bits |= EXTENT_FIRST_DELALLOC;
733 again:
734         if (!prealloc && (mask & __GFP_WAIT)) {
735                 prealloc = alloc_extent_state(mask);
736                 BUG_ON(!prealloc);
737         }
738
739         spin_lock(&tree->lock);
740         if (cached_state && *cached_state) {
741                 state = *cached_state;
742                 if (state->start <= start && state->end > start &&
743                     state->tree) {
744                         node = &state->rb_node;
745                         goto hit_next;
746                 }
747         }
748         /*
749          * this search will find all the extents that end after
750          * our range starts.
751          */
752         node = tree_search(tree, start);
753         if (!node) {
754                 prealloc = alloc_extent_state_atomic(prealloc);
755                 BUG_ON(!prealloc);
756                 err = insert_state(tree, prealloc, start, end, &bits);
757                 prealloc = NULL;
758                 BUG_ON(err == -EEXIST);
759                 goto out;
760         }
761         state = rb_entry(node, struct extent_state, rb_node);
762 hit_next:
763         last_start = state->start;
764         last_end = state->end;
765
766         /*
767          * | ---- desired range ---- |
768          * | state |
769          *
770          * Just lock what we found and keep going
771          */
772         if (state->start == start && state->end <= end) {
773                 struct rb_node *next_node;
774                 if (state->state & exclusive_bits) {
775                         *failed_start = state->start;
776                         err = -EEXIST;
777                         goto out;
778                 }
779
780                 err = set_state_bits(tree, state, &bits);
781                 if (err)
782                         goto out;
783
784                 cache_state(state, cached_state);
785                 merge_state(tree, state);
786                 if (last_end == (u64)-1)
787                         goto out;
788
789                 start = last_end + 1;
790                 next_node = rb_next(&state->rb_node);
791                 if (next_node && start < end && prealloc && !need_resched()) {
792                         state = rb_entry(next_node, struct extent_state,
793                                          rb_node);
794                         if (state->start == start)
795                                 goto hit_next;
796                 }
797                 goto search_again;
798         }
799
800         /*
801          *     | ---- desired range ---- |
802          * | state |
803          *   or
804          * | ------------- state -------------- |
805          *
806          * We need to split the extent we found, and may flip bits on
807          * second half.
808          *
809          * If the extent we found extends past our
810          * range, we just split and search again.  It'll get split
811          * again the next time though.
812          *
813          * If the extent we found is inside our range, we set the
814          * desired bit on it.
815          */
816         if (state->start < start) {
817                 if (state->state & exclusive_bits) {
818                         *failed_start = start;
819                         err = -EEXIST;
820                         goto out;
821                 }
822
823                 prealloc = alloc_extent_state_atomic(prealloc);
824                 BUG_ON(!prealloc);
825                 err = split_state(tree, state, prealloc, start);
826                 BUG_ON(err == -EEXIST);
827                 prealloc = NULL;
828                 if (err)
829                         goto out;
830                 if (state->end <= end) {
831                         err = set_state_bits(tree, state, &bits);
832                         if (err)
833                                 goto out;
834                         cache_state(state, cached_state);
835                         merge_state(tree, state);
836                         if (last_end == (u64)-1)
837                                 goto out;
838                         start = last_end + 1;
839                 }
840                 goto search_again;
841         }
842         /*
843          * | ---- desired range ---- |
844          *     | state | or               | state |
845          *
846          * There's a hole, we need to insert something in it and
847          * ignore the extent we found.
848          */
849         if (state->start > start) {
850                 u64 this_end;
851                 if (end < last_start)
852                         this_end = end;
853                 else
854                         this_end = last_start - 1;
855
856                 prealloc = alloc_extent_state_atomic(prealloc);
857                 BUG_ON(!prealloc);
858
859                 /*
860                  * Avoid freeing 'prealloc' if it can be merged with
861                  * the later extent.
862                  */
863                 err = insert_state(tree, prealloc, start, this_end,
864                                    &bits);
865                 BUG_ON(err == -EEXIST);
866                 if (err) {
867                         free_extent_state(prealloc);
868                         prealloc = NULL;
869                         goto out;
870                 }
871                 cache_state(prealloc, cached_state);
872                 prealloc = NULL;
873                 start = this_end + 1;
874                 goto search_again;
875         }
876         /*
877          * | ---- desired range ---- |
878          *                        | state |
879          * We need to split the extent, and set the bit
880          * on the first half
881          */
882         if (state->start <= end && state->end > end) {
883                 if (state->state & exclusive_bits) {
884                         *failed_start = start;
885                         err = -EEXIST;
886                         goto out;
887                 }
888
889                 prealloc = alloc_extent_state_atomic(prealloc);
890                 BUG_ON(!prealloc);
891                 err = split_state(tree, state, prealloc, end + 1);
892                 BUG_ON(err == -EEXIST);
893
894                 err = set_state_bits(tree, prealloc, &bits);
895                 if (err) {
896                         prealloc = NULL;
897                         goto out;
898                 }
899                 cache_state(prealloc, cached_state);
900                 merge_state(tree, prealloc);
901                 prealloc = NULL;
902                 goto out;
903         }
904
905         goto search_again;
906
907 out:
908         spin_unlock(&tree->lock);
909         if (prealloc)
910                 free_extent_state(prealloc);
911
912         return err;
913
914 search_again:
915         if (start > end)
916                 goto out;
917         spin_unlock(&tree->lock);
918         if (mask & __GFP_WAIT)
919                 cond_resched();
920         goto again;
921 }
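/*
 * Illustrative sketch of the exclusive_bits contract (hypothetical caller):
 *
 *      u64 failed_start;
 *      int err;
 *
 *      err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 *                           &failed_start, NULL, GFP_NOFS);
 *
 * If part of [start, end] already had EXTENT_LOCKED set, err is -EEXIST and
 * failed_start tells the caller where the conflict begins, so it can wait or
 * back off.  lock_extent_bits() and try_lock_extent() below are thin
 * wrappers around exactly this pattern.
 */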
922
923 /* wrappers around set/clear extent bit */
924 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
925                      gfp_t mask)
926 {
927         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
928                               NULL, mask);
929 }
930
931 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
932                     int bits, gfp_t mask)
933 {
934         return set_extent_bit(tree, start, end, bits, 0, NULL,
935                               NULL, mask);
936 }
937
938 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
939                       int bits, gfp_t mask)
940 {
941         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
942 }
943
944 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
945                         struct extent_state **cached_state, gfp_t mask)
946 {
947         return set_extent_bit(tree, start, end,
948                               EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
949                               0, NULL, cached_state, mask);
950 }
951
952 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
953                        gfp_t mask)
954 {
955         return clear_extent_bit(tree, start, end,
956                                 EXTENT_DIRTY | EXTENT_DELALLOC |
957                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
958 }
959
960 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
961                      gfp_t mask)
962 {
963         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
964                               NULL, mask);
965 }
966
967 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
968                         struct extent_state **cached_state, gfp_t mask)
969 {
970         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
971                               NULL, cached_state, mask);
972 }
973
974 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
975                                  u64 end, struct extent_state **cached_state,
976                                  gfp_t mask)
977 {
978         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
979                                 cached_state, mask);
980 }
981
982 /*
983  * either insert or lock the state struct between start and end; use the
984  * mask to tell us if waiting is desired.
985  */
986 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
987                      int bits, struct extent_state **cached_state, gfp_t mask)
988 {
989         int err;
990         u64 failed_start;
991         while (1) {
992                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
993                                      EXTENT_LOCKED, &failed_start,
994                                      cached_state, mask);
995                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
996                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
997                         start = failed_start;
998                 } else {
999                         break;
1000                 }
1001                 WARN_ON(start > end);
1002         }
1003         return err;
1004 }
1005
1006 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1007 {
1008         return lock_extent_bits(tree, start, end, 0, NULL, mask);
1009 }
1010
1011 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1012                     gfp_t mask)
1013 {
1014         int err;
1015         u64 failed_start;
1016
1017         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1018                              &failed_start, NULL, mask);
1019         if (err == -EEXIST) {
1020                 if (failed_start > start)
1021                         clear_extent_bit(tree, start, failed_start - 1,
1022                                          EXTENT_LOCKED, 1, 0, NULL, mask);
1023                 return 0;
1024         }
1025         return 1;
1026 }
1027
1028 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1029                          struct extent_state **cached, gfp_t mask)
1030 {
1031         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1032                                 mask);
1033 }
1034
1035 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1036 {
1037         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1038                                 mask);
1039 }
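/*
 * Illustrative sketch (hypothetical read path): lock a range, keep a
 * reference to the extent_state that covered it, and hand that cached
 * pointer back at unlock time so the tree does not have to be searched
 * again:
 *
 *      struct extent_state *cached = NULL;
 *
 *      lock_extent_bits(tree, start, end, 0, &cached, GFP_NOFS);
 *      ... do the I/O on [start, end] ...
 *      unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 */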
1040
1041 /*
1042  * helper function to set both pages and extents in the tree writeback
1043  */
1044 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1045 {
1046         unsigned long index = start >> PAGE_CACHE_SHIFT;
1047         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1048         struct page *page;
1049
1050         while (index <= end_index) {
1051                 page = find_get_page(tree->mapping, index);
1052                 BUG_ON(!page);
1053                 set_page_writeback(page);
1054                 page_cache_release(page);
1055                 index++;
1056         }
1057         return 0;
1058 }
1059
1060 /*
1061  * find the first offset in the io tree with 'bits' set. zero is
1062  * returned if we find something, and *start_ret and *end_ret are
1063  * set to reflect the state struct that was found.
1064  *
1065  * If nothing was found, 1 is returned, < 0 on error
1066  */
1067 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1068                           u64 *start_ret, u64 *end_ret, int bits)
1069 {
1070         struct rb_node *node;
1071         struct extent_state *state;
1072         int ret = 1;
1073
1074         spin_lock(&tree->lock);
1075         /*
1076          * this search will find all the extents that end after
1077          * our range starts.
1078          */
1079         node = tree_search(tree, start);
1080         if (!node)
1081                 goto out;
1082
1083         while (1) {
1084                 state = rb_entry(node, struct extent_state, rb_node);
1085                 if (state->end >= start && (state->state & bits)) {
1086                         *start_ret = state->start;
1087                         *end_ret = state->end;
1088                         ret = 0;
1089                         break;
1090                 }
1091                 node = rb_next(node);
1092                 if (!node)
1093                         break;
1094         }
1095 out:
1096         spin_unlock(&tree->lock);
1097         return ret;
1098 }
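/*
 * Illustrative sketch (hypothetical scanner): walking every dirty range in
 * the tree from offset 0 upward, assuming none of the ranges ends at
 * (u64)-1:
 *
 *      u64 found_start = 0, found_end;
 *
 *      while (!find_first_extent_bit(tree, found_start, &found_start,
 *                                    &found_end, EXTENT_DIRTY)) {
 *              ... process [found_start, found_end] ...
 *              found_start = found_end + 1;
 *      }
 */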
1099
1100 /* find the first state struct with 'bits' set after 'start', and
1101  * return it.  tree->lock must be held.  NULL will be returned if
1102  * nothing was found after 'start'.
1103  */
1104 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1105                                                  u64 start, int bits)
1106 {
1107         struct rb_node *node;
1108         struct extent_state *state;
1109
1110         /*
1111          * this search will find all the extents that end after
1112          * our range starts.
1113          */
1114         node = tree_search(tree, start);
1115         if (!node)
1116                 goto out;
1117
1118         while (1) {
1119                 state = rb_entry(node, struct extent_state, rb_node);
1120                 if (state->end >= start && (state->state & bits))
1121                         return state;
1122
1123                 node = rb_next(node);
1124                 if (!node)
1125                         break;
1126         }
1127 out:
1128         return NULL;
1129 }
1130
1131 /*
1132  * find a contiguous range of bytes in the file marked as delalloc, not
1133  * more than 'max_bytes'.  'start' and 'end' are used to return the range.
1134  *
1135  * 1 is returned if we find something, 0 if nothing was in the tree
1136  */
1137 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1138                                         u64 *start, u64 *end, u64 max_bytes,
1139                                         struct extent_state **cached_state)
1140 {
1141         struct rb_node *node;
1142         struct extent_state *state;
1143         u64 cur_start = *start;
1144         u64 found = 0;
1145         u64 total_bytes = 0;
1146
1147         spin_lock(&tree->lock);
1148
1149         /*
1150          * this search will find all the extents that end after
1151          * our range starts.
1152          */
1153         node = tree_search(tree, cur_start);
1154         if (!node) {
1155                 if (!found)
1156                         *end = (u64)-1;
1157                 goto out;
1158         }
1159
1160         while (1) {
1161                 state = rb_entry(node, struct extent_state, rb_node);
1162                 if (found && (state->start != cur_start ||
1163                               (state->state & EXTENT_BOUNDARY))) {
1164                         goto out;
1165                 }
1166                 if (!(state->state & EXTENT_DELALLOC)) {
1167                         if (!found)
1168                                 *end = state->end;
1169                         goto out;
1170                 }
1171                 if (!found) {
1172                         *start = state->start;
1173                         *cached_state = state;
1174                         atomic_inc(&state->refs);
1175                 }
1176                 found++;
1177                 *end = state->end;
1178                 cur_start = state->end + 1;
1179                 node = rb_next(node);
1180                 if (!node)
1181                         break;
1182                 total_bytes += state->end - state->start + 1;
1183                 if (total_bytes >= max_bytes)
1184                         break;
1185         }
1186 out:
1187         spin_unlock(&tree->lock);
1188         return found;
1189 }
1190
1191 static noinline int __unlock_for_delalloc(struct inode *inode,
1192                                           struct page *locked_page,
1193                                           u64 start, u64 end)
1194 {
1195         int ret;
1196         struct page *pages[16];
1197         unsigned long index = start >> PAGE_CACHE_SHIFT;
1198         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1199         unsigned long nr_pages = end_index - index + 1;
1200         int i;
1201
1202         if (index == locked_page->index && end_index == index)
1203                 return 0;
1204
1205         while (nr_pages > 0) {
1206                 ret = find_get_pages_contig(inode->i_mapping, index,
1207                                      min_t(unsigned long, nr_pages,
1208                                      ARRAY_SIZE(pages)), pages);
1209                 for (i = 0; i < ret; i++) {
1210                         if (pages[i] != locked_page)
1211                                 unlock_page(pages[i]);
1212                         page_cache_release(pages[i]);
1213                 }
1214                 nr_pages -= ret;
1215                 index += ret;
1216                 cond_resched();
1217         }
1218         return 0;
1219 }
1220
1221 static noinline int lock_delalloc_pages(struct inode *inode,
1222                                         struct page *locked_page,
1223                                         u64 delalloc_start,
1224                                         u64 delalloc_end)
1225 {
1226         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1227         unsigned long start_index = index;
1228         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1229         unsigned long pages_locked = 0;
1230         struct page *pages[16];
1231         unsigned long nrpages;
1232         int ret;
1233         int i;
1234
1235         /* the caller is responsible for locking the start index */
1236         if (index == locked_page->index && index == end_index)
1237                 return 0;
1238
1239         /* skip the page at the start index */
1240         nrpages = end_index - index + 1;
1241         while (nrpages > 0) {
1242                 ret = find_get_pages_contig(inode->i_mapping, index,
1243                                      min_t(unsigned long,
1244                                      nrpages, ARRAY_SIZE(pages)), pages);
1245                 if (ret == 0) {
1246                         ret = -EAGAIN;
1247                         goto done;
1248                 }
1249                 /* now we have an array of pages, lock them all */
1250                 for (i = 0; i < ret; i++) {
1251                         /*
1252                          * the caller is taking responsibility for
1253                          * locked_page
1254                          */
1255                         if (pages[i] != locked_page) {
1256                                 lock_page(pages[i]);
1257                                 if (!PageDirty(pages[i]) ||
1258                                     pages[i]->mapping != inode->i_mapping) {
1259                                         ret = -EAGAIN;
1260                                         unlock_page(pages[i]);
1261                                         page_cache_release(pages[i]);
1262                                         goto done;
1263                                 }
1264                         }
1265                         page_cache_release(pages[i]);
1266                         pages_locked++;
1267                 }
1268                 nrpages -= ret;
1269                 index += ret;
1270                 cond_resched();
1271         }
1272         ret = 0;
1273 done:
1274         if (ret && pages_locked) {
1275                 __unlock_for_delalloc(inode, locked_page,
1276                               delalloc_start,
1277                               ((u64)(start_index + pages_locked - 1)) <<
1278                               PAGE_CACHE_SHIFT);
1279         }
1280         return ret;
1281 }
1282
1283 /*
1284  * find a contiguous range of bytes in the file marked as delalloc, not
1285  * more than 'max_bytes'.  'start' and 'end' are used to return the range.
1286  *
1287  * 1 is returned if we find something, 0 if nothing was in the tree
1288  */
1289 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1290                                              struct extent_io_tree *tree,
1291                                              struct page *locked_page,
1292                                              u64 *start, u64 *end,
1293                                              u64 max_bytes)
1294 {
1295         u64 delalloc_start;
1296         u64 delalloc_end;
1297         u64 found;
1298         struct extent_state *cached_state = NULL;
1299         int ret;
1300         int loops = 0;
1301
1302 again:
1303         /* step one, find a bunch of delalloc bytes starting at start */
1304         delalloc_start = *start;
1305         delalloc_end = 0;
1306         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1307                                     max_bytes, &cached_state);
1308         if (!found || delalloc_end <= *start) {
1309                 *start = delalloc_start;
1310                 *end = delalloc_end;
1311                 free_extent_state(cached_state);
1312                 return found;
1313         }
1314
1315         /*
1316          * start comes from the offset of locked_page.  We have to lock
1317          * pages in order, so we can't process delalloc bytes before
1318          * locked_page
1319          */
1320         if (delalloc_start < *start)
1321                 delalloc_start = *start;
1322
1323         /*
1324          * make sure to limit the number of pages we try to lock down
1325          * if we're looping.
1326          */
1327         if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1328                 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1329
1330         /* step two, lock all the pages after the page that has start */
1331         ret = lock_delalloc_pages(inode, locked_page,
1332                                   delalloc_start, delalloc_end);
1333         if (ret == -EAGAIN) {
1334                 /* some of the pages are gone, lets avoid looping by
1335                 /* some of the pages are gone, let's avoid looping by
1336                  */
1337                 free_extent_state(cached_state);
1338                 if (!loops) {
1339                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1340                         max_bytes = PAGE_CACHE_SIZE - offset;
1341                         loops = 1;
1342                         goto again;
1343                 } else {
1344                         found = 0;
1345                         goto out_failed;
1346                 }
1347         }
1348         BUG_ON(ret);
1349
1350         /* step three, lock the state bits for the whole range */
1351         lock_extent_bits(tree, delalloc_start, delalloc_end,
1352                          0, &cached_state, GFP_NOFS);
1353
1354         /* then test to make sure it is all still delalloc */
1355         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1356                              EXTENT_DELALLOC, 1, cached_state);
1357         if (!ret) {
1358                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1359                                      &cached_state, GFP_NOFS);
1360                 __unlock_for_delalloc(inode, locked_page,
1361                               delalloc_start, delalloc_end);
1362                 cond_resched();
1363                 goto again;
1364         }
1365         free_extent_state(cached_state);
1366         *start = delalloc_start;
1367         *end = delalloc_end;
1368 out_failed:
1369         return found;
1370 }
1371
1372 int extent_clear_unlock_delalloc(struct inode *inode,
1373                                 struct extent_io_tree *tree,
1374                                 u64 start, u64 end, struct page *locked_page,
1375                                 unsigned long op)
1376 {
1377         int ret;
1378         struct page *pages[16];
1379         unsigned long index = start >> PAGE_CACHE_SHIFT;
1380         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1381         unsigned long nr_pages = end_index - index + 1;
1382         int i;
1383         int clear_bits = 0;
1384
1385         if (op & EXTENT_CLEAR_UNLOCK)
1386                 clear_bits |= EXTENT_LOCKED;
1387         if (op & EXTENT_CLEAR_DIRTY)
1388                 clear_bits |= EXTENT_DIRTY;
1389
1390         if (op & EXTENT_CLEAR_DELALLOC)
1391                 clear_bits |= EXTENT_DELALLOC;
1392
1393         clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1394         if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1395                     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1396                     EXTENT_SET_PRIVATE2)))
1397                 return 0;
1398
1399         while (nr_pages > 0) {
1400                 ret = find_get_pages_contig(inode->i_mapping, index,
1401                                      min_t(unsigned long,
1402                                      nr_pages, ARRAY_SIZE(pages)), pages);
1403                 for (i = 0; i < ret; i++) {
1404
1405                         if (op & EXTENT_SET_PRIVATE2)
1406                                 SetPagePrivate2(pages[i]);
1407
1408                         if (pages[i] == locked_page) {
1409                                 page_cache_release(pages[i]);
1410                                 continue;
1411                         }
1412                         if (op & EXTENT_CLEAR_DIRTY)
1413                                 clear_page_dirty_for_io(pages[i]);
1414                         if (op & EXTENT_SET_WRITEBACK)
1415                                 set_page_writeback(pages[i]);
1416                         if (op & EXTENT_END_WRITEBACK)
1417                                 end_page_writeback(pages[i]);
1418                         if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1419                                 unlock_page(pages[i]);
1420                         page_cache_release(pages[i]);
1421                 }
1422                 nr_pages -= ret;
1423                 index += ret;
1424                 cond_resched();
1425         }
1426         return 0;
1427 }
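/*
 * Illustrative sketch (hypothetical error/cleanup path): drop the delalloc
 * accounting and release both the extent range and the pages in one call,
 * leaving only locked_page to the caller:
 *
 *      extent_clear_unlock_delalloc(inode, tree, start, end, locked_page,
 *                                   EXTENT_CLEAR_UNLOCK |
 *                                   EXTENT_CLEAR_DELALLOC |
 *                                   EXTENT_CLEAR_DIRTY |
 *                                   EXTENT_CLEAR_UNLOCK_PAGE);
 */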
1428
1429 /*
1430  * count the number of bytes in the tree that have a given bit(s)
1431  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1432  * cached.  The total number found is returned.
1433  */
1434 u64 count_range_bits(struct extent_io_tree *tree,
1435                      u64 *start, u64 search_end, u64 max_bytes,
1436                      unsigned long bits, int contig)
1437 {
1438         struct rb_node *node;
1439         struct extent_state *state;
1440         u64 cur_start = *start;
1441         u64 total_bytes = 0;
1442         u64 last = 0;
1443         int found = 0;
1444
1445         if (search_end <= cur_start) {
1446                 WARN_ON(1);
1447                 return 0;
1448         }
1449
1450         spin_lock(&tree->lock);
1451         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1452                 total_bytes = tree->dirty_bytes;
1453                 goto out;
1454         }
1455         /*
1456          * this search will find all the extents that end after
1457          * our range starts.
1458          */
1459         node = tree_search(tree, cur_start);
1460         if (!node)
1461                 goto out;
1462
1463         while (1) {
1464                 state = rb_entry(node, struct extent_state, rb_node);
1465                 if (state->start > search_end)
1466                         break;
1467                 if (contig && found && state->start > last + 1)
1468                         break;
1469                 if (state->end >= cur_start && (state->state & bits) == bits) {
1470                         total_bytes += min(search_end, state->end) + 1 -
1471                                        max(cur_start, state->start);
1472                         if (total_bytes >= max_bytes)
1473                                 break;
1474                         if (!found) {
1475                                 *start = max(cur_start, state->start);
1476                                 found = 1;
1477                         }
1478                         last = state->end;
1479                 } else if (contig && found) {
1480                         break;
1481                 }
1482                 node = rb_next(node);
1483                 if (!node)
1484                         break;
1485         }
1486 out:
1487         spin_unlock(&tree->lock);
1488         return total_bytes;
1489 }
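/*
 * Illustrative sketch (hypothetical caller, made-up sizes): count how many
 * delalloc bytes sit in the first megabyte, stopping at the first gap
 * because contig == 1:
 *
 *      u64 found_start = 0;
 *      u64 bytes;
 *
 *      bytes = count_range_bits(tree, &found_start, 1024 * 1024 - 1,
 *                               (u64)-1, EXTENT_DELALLOC, 1);
 *
 * On return, found_start holds the offset where the counted run begins.
 */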
1490
1491 /*
1492  * set the private field for a given byte offset in the tree.  If there isn't
1493  * an extent_state that starts at that offset, -ENOENT is returned.
1494  */
1495 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1496 {
1497         struct rb_node *node;
1498         struct extent_state *state;
1499         int ret = 0;
1500
1501         spin_lock(&tree->lock);
1502         /*
1503          * this search will find all the extents that end after
1504          * our range starts.
1505          */
1506         node = tree_search(tree, start);
1507         if (!node) {
1508                 ret = -ENOENT;
1509                 goto out;
1510         }
1511         state = rb_entry(node, struct extent_state, rb_node);
1512         if (state->start != start) {
1513                 ret = -ENOENT;
1514                 goto out;
1515         }
1516         state->private = private;
1517 out:
1518         spin_unlock(&tree->lock);
1519         return ret;
1520 }
1521
1522 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1523 {
1524         struct rb_node *node;
1525         struct extent_state *state;
1526         int ret = 0;
1527
1528         spin_lock(&tree->lock);
1529         /*
1530          * this search will find all the extents that end after
1531          * our range starts.
1532          */
1533         node = tree_search(tree, start);
1534         if (!node) {
1535                 ret = -ENOENT;
1536                 goto out;
1537         }
1538         state = rb_entry(node, struct extent_state, rb_node);
1539         if (state->start != start) {
1540                 ret = -ENOENT;
1541                 goto out;
1542         }
1543         *private = state->private;
1544 out:
1545         spin_unlock(&tree->lock);
1546         return ret;
1547 }
1548
1549 /*
1550  * searches a range in the state tree for a given mask.
1551  * If 'filled' == 1, this returns 1 only if the whole range is covered
1552  * by extent states that have the bits set.  Otherwise, 1 is returned
1553  * if any bit in the range is found set.
1554  */
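/*
 * Examples of the two modes (illustrative only):
 *
 *	test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)
 *		returns 1 only when every byte of [start, end] is covered
 *		by states that have EXTENT_UPTODATE set (no gaps allowed)
 *
 *	test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)
 *		returns 1 if any part of the range is locked
 *
 * When 'cached' covers 'start' it is used as the starting point of the
 * search instead of walking down from the root of the rb tree.
 */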
1555 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1556                    int bits, int filled, struct extent_state *cached)
1557 {
1558         struct extent_state *state = NULL;
1559         struct rb_node *node;
1560         int bitset = 0;
1561
1562         spin_lock(&tree->lock);
1563         if (cached && cached->tree && cached->start <= start &&
1564             cached->end > start)
1565                 node = &cached->rb_node;
1566         else
1567                 node = tree_search(tree, start);
1568         while (node && start <= end) {
1569                 state = rb_entry(node, struct extent_state, rb_node);
1570
1571                 if (filled && state->start > start) {
1572                         bitset = 0;
1573                         break;
1574                 }
1575
1576                 if (state->start > end)
1577                         break;
1578
1579                 if (state->state & bits) {
1580                         bitset = 1;
1581                         if (!filled)
1582                                 break;
1583                 } else if (filled) {
1584                         bitset = 0;
1585                         break;
1586                 }
1587
1588                 if (state->end == (u64)-1)
1589                         break;
1590
1591                 start = state->end + 1;
1592                 if (start > end)
1593                         break;
1594                 node = rb_next(node);
1595                 if (!node) {
1596                         if (filled)
1597                                 bitset = 0;
1598                         break;
1599                 }
1600         }
1601         spin_unlock(&tree->lock);
1602         return bitset;
1603 }
1604
1605 /*
1606  * helper function to set a given page up to date if all the
1607  * extents in the tree for that page are up to date
1608  */
1609 static int check_page_uptodate(struct extent_io_tree *tree,
1610                                struct page *page)
1611 {
1612         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1613         u64 end = start + PAGE_CACHE_SIZE - 1;
1614         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1615                 SetPageUptodate(page);
1616         return 0;
1617 }
1618
1619 /*
1620  * helper function to unlock a page if all the extents in the tree
1621  * for that page are unlocked
1622  */
1623 static int check_page_locked(struct extent_io_tree *tree,
1624                              struct page *page)
1625 {
1626         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1627         u64 end = start + PAGE_CACHE_SIZE - 1;
1628         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1629                 unlock_page(page);
1630         return 0;
1631 }
1632
1633 /*
1634  * helper called when a sub-page chunk of writeback completes; at the
1635  * moment it simply ends writeback on the whole page
1636  */
1637 static int check_page_writeback(struct extent_io_tree *tree,
1638                              struct page *page)
1639 {
1640         end_page_writeback(page);
1641         return 0;
1642 }
1643
1644 /* lots and lots of room for performance fixes in the end_bio funcs */
1645
1646 /*
1647  * after a writepage IO is done, we need to:
1648  * clear the uptodate bits on error
1649  * clear the writeback bits in the extent tree for this IO
1650  * end_page_writeback if the page has no more pending IO
1651  *
1652  * Scheduling is not allowed, so the extent state tree is expected
1653  * to have one and only one object corresponding to this IO.
1654  */
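/*
 * Note: the bio_vec array is walked from the last vector back to the
 * first, and writeback is ended directly only when a vector covers a
 * whole page; partial-page IO goes through check_page_writeback().
 */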
1655 static void end_bio_extent_writepage(struct bio *bio, int err)
1656 {
1657         int uptodate = err == 0;
1658         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1659         struct extent_io_tree *tree;
1660         u64 start;
1661         u64 end;
1662         int whole_page;
1663         int ret;
1664
1665         do {
1666                 struct page *page = bvec->bv_page;
1667                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1668
1669                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1670                          bvec->bv_offset;
1671                 end = start + bvec->bv_len - 1;
1672
1673                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1674                         whole_page = 1;
1675                 else
1676                         whole_page = 0;
1677
1678                 if (--bvec >= bio->bi_io_vec)
1679                         prefetchw(&bvec->bv_page->flags);
1680                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1681                         ret = tree->ops->writepage_end_io_hook(page, start,
1682                                                        end, NULL, uptodate);
1683                         if (ret)
1684                                 uptodate = 0;
1685                 }
1686
1687                 if (!uptodate && tree->ops &&
1688                     tree->ops->writepage_io_failed_hook) {
1689                         ret = tree->ops->writepage_io_failed_hook(bio, page,
1690                                                          start, end, NULL);
1691                         if (ret == 0) {
1692                                 uptodate = (err == 0);
1693                                 continue;
1694                         }
1695                 }
1696
1697                 if (!uptodate) {
1698                         clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1699                         ClearPageUptodate(page);
1700                         SetPageError(page);
1701                 }
1702
1703                 if (whole_page)
1704                         end_page_writeback(page);
1705                 else
1706                         check_page_writeback(tree, page);
1707         } while (bvec >= bio->bi_io_vec);
1708
1709         bio_put(bio);
1710 }
1711
1712 /*
1713  * after a readpage IO is done, we need to:
1714  * clear the uptodate bits on error
1715  * set the uptodate bits if things worked
1716  * set the page up to date if all extents covering the page are uptodate
1717  * clear the lock bit in the extent tree
1718  * unlock the page if there are no other extents locked for it
1719  *
1720  * Scheduling is not allowed, so the extent state tree is expected
1721  * to have one and only one object corresponding to this IO.
1722  */
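/*
 * The EXTENT_LOCKED lookup below also primes 'cached': cache_state()
 * takes an extra reference that is dropped either by uncache_state() on
 * the retry path or by unlock_extent_cached(), which lets the unlock
 * skip another tree search.
 */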
1723 static void end_bio_extent_readpage(struct bio *bio, int err)
1724 {
1725         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1726         struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1727         struct bio_vec *bvec = bio->bi_io_vec;
1728         struct extent_io_tree *tree;
1729         u64 start;
1730         u64 end;
1731         int whole_page;
1732         int ret;
1733
1734         if (err)
1735                 uptodate = 0;
1736
1737         do {
1738                 struct page *page = bvec->bv_page;
1739                 struct extent_state *cached = NULL;
1740                 struct extent_state *state;
1741
1742                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1743
1744                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1745                         bvec->bv_offset;
1746                 end = start + bvec->bv_len - 1;
1747
1748                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1749                         whole_page = 1;
1750                 else
1751                         whole_page = 0;
1752
1753                 if (++bvec <= bvec_end)
1754                         prefetchw(&bvec->bv_page->flags);
1755
1756                 spin_lock(&tree->lock);
1757                 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1758                 if (state && state->start == start) {
1759                         /*
1760                          * take a reference on the state, unlock will drop
1761                          * the ref
1762                          */
1763                         cache_state(state, &cached);
1764                 }
1765                 spin_unlock(&tree->lock);
1766
1767                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1768                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1769                                                               state);
1770                         if (ret)
1771                                 uptodate = 0;
1772                 }
1773                 if (!uptodate && tree->ops &&
1774                     tree->ops->readpage_io_failed_hook) {
1775                         ret = tree->ops->readpage_io_failed_hook(bio, page,
1776                                                          start, end, NULL);
1777                         if (ret == 0) {
1778                                 uptodate =
1779                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
1780                                 if (err)
1781                                         uptodate = 0;
1782                                 uncache_state(&cached);
1783                                 continue;
1784                         }
1785                 }
1786
1787                 if (uptodate) {
1788                         set_extent_uptodate(tree, start, end, &cached,
1789                                             GFP_ATOMIC);
1790                 }
1791                 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1792
1793                 if (whole_page) {
1794                         if (uptodate) {
1795                                 SetPageUptodate(page);
1796                         } else {
1797                                 ClearPageUptodate(page);
1798                                 SetPageError(page);
1799                         }
1800                         unlock_page(page);
1801                 } else {
1802                         if (uptodate) {
1803                                 check_page_uptodate(tree, page);
1804                         } else {
1805                                 ClearPageUptodate(page);
1806                                 SetPageError(page);
1807                         }
1808                         check_page_locked(tree, page);
1809                 }
1810         } while (bvec <= bvec_end);
1811
1812         bio_put(bio);
1813 }
1814
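/*
 * allocate a bio for IO starting at 'first_sector' on 'bdev'.  If the
 * allocation fails while we are in reclaim context (PF_MEMALLOC), keep
 * retrying with half as many vectors so that at least a small bio can
 * be built.
 */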
1815 struct bio *
1816 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1817                 gfp_t gfp_flags)
1818 {
1819         struct bio *bio;
1820
1821         bio = bio_alloc(gfp_flags, nr_vecs);
1822
1823         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1824                 while (!bio && (nr_vecs /= 2))
1825                         bio = bio_alloc(gfp_flags, nr_vecs);
1826         }
1827
1828         if (bio) {
1829                 bio->bi_size = 0;
1830                 bio->bi_bdev = bdev;
1831                 bio->bi_sector = first_sector;
1832         }
1833         return bio;
1834 }
1835
1836 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1837                           unsigned long bio_flags)
1838 {
1839         int ret = 0;
1840         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1841         struct page *page = bvec->bv_page;
1842         struct extent_io_tree *tree = bio->bi_private;
1843         u64 start;
1844
1845         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1846
1847         bio->bi_private = NULL;
1848
1849         bio_get(bio);
1850
1851         if (tree->ops && tree->ops->submit_bio_hook)
1852                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1853                                            mirror_num, bio_flags, start);
1854         else
1855                 submit_bio(rw, bio);
1856         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1857                 ret = -EOPNOTSUPP;
1858         bio_put(bio);
1859         return ret;
1860 }
1861
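/*
 * add a page to the bio being built up in *bio_ret, or start a new one.
 * If the existing bio is not contiguous with 'sector', carries different
 * bio_flags, is rejected by the merge_bio_hook or has no room left for
 * the page, it is submitted first and a fresh bio is allocated.
 */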
1862 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1863                               struct page *page, sector_t sector,
1864                               size_t size, unsigned long offset,
1865                               struct block_device *bdev,
1866                               struct bio **bio_ret,
1867                               unsigned long max_pages,
1868                               bio_end_io_t end_io_func,
1869                               int mirror_num,
1870                               unsigned long prev_bio_flags,
1871                               unsigned long bio_flags)
1872 {
1873         int ret = 0;
1874         struct bio *bio;
1875         int nr;
1876         int contig = 0;
1877         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1878         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1879         size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1880
1881         if (bio_ret && *bio_ret) {
1882                 bio = *bio_ret;
1883                 if (old_compressed)
1884                         contig = bio->bi_sector == sector;
1885                 else
1886                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
1887                                 sector;
1888
1889                 if (prev_bio_flags != bio_flags || !contig ||
1890                     (tree->ops && tree->ops->merge_bio_hook &&
1891                      tree->ops->merge_bio_hook(page, offset, page_size, bio,
1892                                                bio_flags)) ||
1893                     bio_add_page(bio, page, page_size, offset) < page_size) {
1894                         ret = submit_one_bio(rw, bio, mirror_num,
1895                                              prev_bio_flags);
1896                         bio = NULL;
1897                 } else {
1898                         return 0;
1899                 }
1900         }
1901         if (this_compressed)
1902                 nr = BIO_MAX_PAGES;
1903         else
1904                 nr = bio_get_nr_vecs(bdev);
1905
1906         bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1907         if (!bio)
1908                 return -ENOMEM;
1909
1910         bio_add_page(bio, page, page_size, offset);
1911         bio->bi_end_io = end_io_func;
1912         bio->bi_private = tree;
1913
1914         if (bio_ret)
1915                 *bio_ret = bio;
1916         else
1917                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1918
1919         return ret;
1920 }
1921
1922 void set_page_extent_mapped(struct page *page)
1923 {
1924         if (!PagePrivate(page)) {
1925                 SetPagePrivate(page);
1926                 page_cache_get(page);
1927                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1928         }
1929 }
1930
1931 static void set_page_extent_head(struct page *page, unsigned long len)
1932 {
1933         WARN_ON(!PagePrivate(page));
1934         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935 }
1936
1937 /*
1938  * basic readpage implementation.  Locked extent state structs are inserted
1939  * into the tree and removed again when the IO is done (by the end_io
1940  * handlers)
1941  */
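/*
 * Rough flow: wait for any ordered extent covering the page, zero the
 * part of the page past i_size, then walk the page block by block with
 * get_extent().  Holes and ranges that are already uptodate are handled
 * inline; everything else is submitted via submit_extent_page() with
 * end_bio_extent_readpage() as the completion handler.
 */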
1942 static int __extent_read_full_page(struct extent_io_tree *tree,
1943                                    struct page *page,
1944                                    get_extent_t *get_extent,
1945                                    struct bio **bio, int mirror_num,
1946                                    unsigned long *bio_flags)
1947 {
1948         struct inode *inode = page->mapping->host;
1949         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1950         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1951         u64 end;
1952         u64 cur = start;
1953         u64 extent_offset;
1954         u64 last_byte = i_size_read(inode);
1955         u64 block_start;
1956         u64 cur_end;
1957         sector_t sector;
1958         struct extent_map *em;
1959         struct block_device *bdev;
1960         struct btrfs_ordered_extent *ordered;
1961         int ret;
1962         int nr = 0;
1963         size_t pg_offset = 0;
1964         size_t iosize;
1965         size_t disk_io_size;
1966         size_t blocksize = inode->i_sb->s_blocksize;
1967         unsigned long this_bio_flag = 0;
1968
1969         set_page_extent_mapped(page);
1970
1971         end = page_end;
1972         while (1) {
1973                 lock_extent(tree, start, end, GFP_NOFS);
1974                 ordered = btrfs_lookup_ordered_extent(inode, start);
1975                 if (!ordered)
1976                         break;
1977                 unlock_extent(tree, start, end, GFP_NOFS);
1978                 btrfs_start_ordered_extent(inode, ordered, 1);
1979                 btrfs_put_ordered_extent(ordered);
1980         }
1981
1982         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1983                 char *userpage;
1984                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1985
1986                 if (zero_offset) {
1987                         iosize = PAGE_CACHE_SIZE - zero_offset;
1988                         userpage = kmap_atomic(page, KM_USER0);
1989                         memset(userpage + zero_offset, 0, iosize);
1990                         flush_dcache_page(page);
1991                         kunmap_atomic(userpage, KM_USER0);
1992                 }
1993         }
1994         while (cur <= end) {
1995                 if (cur >= last_byte) {
1996                         char *userpage;
1997                         struct extent_state *cached = NULL;
1998
1999                         iosize = PAGE_CACHE_SIZE - pg_offset;
2000                         userpage = kmap_atomic(page, KM_USER0);
2001                         memset(userpage + pg_offset, 0, iosize);
2002                         flush_dcache_page(page);
2003                         kunmap_atomic(userpage, KM_USER0);
2004                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2005                                             &cached, GFP_NOFS);
2006                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2007                                              &cached, GFP_NOFS);
2008                         break;
2009                 }
2010                 em = get_extent(inode, page, pg_offset, cur,
2011                                 end - cur + 1, 0);
2012                 if (IS_ERR_OR_NULL(em)) {
2013                         SetPageError(page);
2014                         unlock_extent(tree, cur, end, GFP_NOFS);
2015                         break;
2016                 }
2017                 extent_offset = cur - em->start;
2018                 BUG_ON(extent_map_end(em) <= cur);
2019                 BUG_ON(end < cur);
2020
2021                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2022                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2023                         extent_set_compress_type(&this_bio_flag,
2024                                                  em->compress_type);
2025                 }
2026
2027                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2028                 cur_end = min(extent_map_end(em) - 1, end);
2029                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2030                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2031                         disk_io_size = em->block_len;
2032                         sector = em->block_start >> 9;
2033                 } else {
2034                         sector = (em->block_start + extent_offset) >> 9;
2035                         disk_io_size = iosize;
2036                 }
2037                 bdev = em->bdev;
2038                 block_start = em->block_start;
2039                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2040                         block_start = EXTENT_MAP_HOLE;
2041                 free_extent_map(em);
2042                 em = NULL;
2043
2044                 /* we've found a hole, just zero and go on */
2045                 if (block_start == EXTENT_MAP_HOLE) {
2046                         char *userpage;
2047                         struct extent_state *cached = NULL;
2048
2049                         userpage = kmap_atomic(page, KM_USER0);
2050                         memset(userpage + pg_offset, 0, iosize);
2051                         flush_dcache_page(page);
2052                         kunmap_atomic(userpage, KM_USER0);
2053
2054                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2055                                             &cached, GFP_NOFS);
2056                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2057                                              &cached, GFP_NOFS);
2058                         cur = cur + iosize;
2059                         pg_offset += iosize;
2060                         continue;
2061                 }
2062                 /* the get_extent function already copied into the page */
2063                 if (test_range_bit(tree, cur, cur_end,
2064                                    EXTENT_UPTODATE, 1, NULL)) {
2065                         check_page_uptodate(tree, page);
2066                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2067                         cur = cur + iosize;
2068                         pg_offset += iosize;
2069                         continue;
2070                 }
2071                 /* we have an inline extent but it didn't get marked up
2072                  * to date.  Error out
2073                  */
2074                 if (block_start == EXTENT_MAP_INLINE) {
2075                         SetPageError(page);
2076                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2077                         cur = cur + iosize;
2078                         pg_offset += iosize;
2079                         continue;
2080                 }
2081
2082                 ret = 0;
2083                 if (tree->ops && tree->ops->readpage_io_hook) {
2084                         ret = tree->ops->readpage_io_hook(page, cur,
2085                                                           cur + iosize - 1);
2086                 }
2087                 if (!ret) {
2088                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2089                         pnr -= page->index;
2090                         ret = submit_extent_page(READ, tree, page,
2091                                          sector, disk_io_size, pg_offset,
2092                                          bdev, bio, pnr,
2093                                          end_bio_extent_readpage, mirror_num,
2094                                          *bio_flags,
2095                                          this_bio_flag);
2096                         nr++;
2097                         *bio_flags = this_bio_flag;
2098                 }
2099                 if (ret)
2100                         SetPageError(page);
2101                 cur = cur + iosize;
2102                 pg_offset += iosize;
2103         }
2104         if (!nr) {
2105                 if (!PageError(page))
2106                         SetPageUptodate(page);
2107                 unlock_page(page);
2108         }
2109         return 0;
2110 }
2111
2112 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2113                             get_extent_t *get_extent)
2114 {
2115         struct bio *bio = NULL;
2116         unsigned long bio_flags = 0;
2117         int ret;
2118
2119         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2120                                       &bio_flags);
2121         if (bio)
2122                 ret = submit_one_bio(READ, bio, 0, bio_flags);
2123         return ret;
2124 }
2125
2126 static noinline void update_nr_written(struct page *page,
2127                                       struct writeback_control *wbc,
2128                                       unsigned long nr_written)
2129 {
2130         wbc->nr_to_write -= nr_written;
2131         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2132             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2133                 page->mapping->writeback_index = page->index + nr_written;
2134 }
2135
2136 /*
2137  * the writepage semantics are similar to regular writepage.  extent
2138  * records are inserted to lock ranges in the tree, and as dirty areas
2139  * are found, they are marked writeback.  Then the lock bits are removed
2140  * and the end_io handler clears the writeback ranges
2141  */
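/*
 * Rough flow of __extent_writepage(): zero the tail of the page past
 * i_size, and (unless the caller already locked the range) run the
 * delalloc ranges on the page through fill_delalloc.  Then the page is
 * walked block by block, each block mapped with get_extent, marked
 * writeback and handed to submit_extent_page(); holes, inline and
 * compressed extents are skipped because they are written out through
 * other paths.
 */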
2142 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2143                               void *data)
2144 {
2145         struct inode *inode = page->mapping->host;
2146         struct extent_page_data *epd = data;
2147         struct extent_io_tree *tree = epd->tree;
2148         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2149         u64 delalloc_start;
2150         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2151         u64 end;
2152         u64 cur = start;
2153         u64 extent_offset;
2154         u64 last_byte = i_size_read(inode);
2155         u64 block_start;
2156         u64 iosize;
2157         sector_t sector;
2158         struct extent_state *cached_state = NULL;
2159         struct extent_map *em;
2160         struct block_device *bdev;
2161         int ret;
2162         int nr = 0;
2163         size_t pg_offset = 0;
2164         size_t blocksize;
2165         loff_t i_size = i_size_read(inode);
2166         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2167         u64 nr_delalloc;
2168         u64 delalloc_end;
2169         int page_started;
2170         int compressed;
2171         int write_flags;
2172         unsigned long nr_written = 0;
2173
2174         if (wbc->sync_mode == WB_SYNC_ALL)
2175                 write_flags = WRITE_SYNC;
2176         else
2177                 write_flags = WRITE;
2178
2179         trace___extent_writepage(page, inode, wbc);
2180
2181         WARN_ON(!PageLocked(page));
2182         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2183         if (page->index > end_index ||
2184            (page->index == end_index && !pg_offset)) {
2185                 page->mapping->a_ops->invalidatepage(page, 0);
2186                 unlock_page(page);
2187                 return 0;
2188         }
2189
2190         if (page->index == end_index) {
2191                 char *userpage;
2192
2193                 userpage = kmap_atomic(page, KM_USER0);
2194                 memset(userpage + pg_offset, 0,
2195                        PAGE_CACHE_SIZE - pg_offset);
2196                 kunmap_atomic(userpage, KM_USER0);
2197                 flush_dcache_page(page);
2198         }
2199         pg_offset = 0;
2200
2201         set_page_extent_mapped(page);
2202
2203         delalloc_start = start;
2204         delalloc_end = 0;
2205         page_started = 0;
2206         if (!epd->extent_locked) {
2207                 u64 delalloc_to_write = 0;
2208                 /*
2209                  * make sure the mapping's writeback_index is at least
2210                  * updated to this page.
2211                  */
2212                 update_nr_written(page, wbc, 0);
2213
2214                 while (delalloc_end < page_end) {
2215                         nr_delalloc = find_lock_delalloc_range(inode, tree,
2216                                                        page,
2217                                                        &delalloc_start,
2218                                                        &delalloc_end,
2219                                                        128 * 1024 * 1024);
2220                         if (nr_delalloc == 0) {
2221                                 delalloc_start = delalloc_end + 1;
2222                                 continue;
2223                         }
2224                         tree->ops->fill_delalloc(inode, page, delalloc_start,
2225                                                  delalloc_end, &page_started,
2226                                                  &nr_written);
2227                         /*
2228                          * delalloc_end is the inclusive last byte of the
2229                          * range, so we add a full PAGE_CACHE_SIZE here
2230                          * instead of PAGE_CACHE_SIZE - 1 when rounding up
2231                          */
2232                         delalloc_to_write += (delalloc_end - delalloc_start +
2233                                               PAGE_CACHE_SIZE) >>
2234                                               PAGE_CACHE_SHIFT;
2235                         delalloc_start = delalloc_end + 1;
2236                 }
2237                 if (wbc->nr_to_write < delalloc_to_write) {
2238                         int thresh = 8192;
2239
2240                         if (delalloc_to_write < thresh * 2)
2241                                 thresh = delalloc_to_write;
2242                         wbc->nr_to_write = min_t(u64, delalloc_to_write,
2243                                                  thresh);
2244                 }
2245
2246                 /* did the fill delalloc function already unlock and start
2247                  * the IO?
2248                  */
2249                 if (page_started) {
2250                         ret = 0;
2251                         /*
2252                          * we've unlocked the page, so we can't update
2253                          * the mapping's writeback index, just update
2254                          * nr_to_write.
2255                          */
2256                         wbc->nr_to_write -= nr_written;
2257                         goto done_unlocked;
2258                 }
2259         }
2260         if (tree->ops && tree->ops->writepage_start_hook) {
2261                 ret = tree->ops->writepage_start_hook(page, start,
2262                                                       page_end);
2263                 if (ret == -EAGAIN) {
2264                         redirty_page_for_writepage(wbc, page);
2265                         update_nr_written(page, wbc, nr_written);
2266                         unlock_page(page);
2267                         ret = 0;
2268                         goto done_unlocked;
2269                 }
2270         }
2271
2272         /*
2273          * we don't want to touch the inode after unlocking the page,
2274          * so we update the mapping writeback index now
2275          */
2276         update_nr_written(page, wbc, nr_written + 1);
2277
2278         end = page_end;
2279         if (last_byte <= start) {
2280                 if (tree->ops && tree->ops->writepage_end_io_hook)
2281                         tree->ops->writepage_end_io_hook(page, start,
2282                                                          page_end, NULL, 1);
2283                 goto done;
2284         }
2285
2286         blocksize = inode->i_sb->s_blocksize;
2287
2288         while (cur <= end) {
2289                 if (cur >= last_byte) {
2290                         if (tree->ops && tree->ops->writepage_end_io_hook)
2291                                 tree->ops->writepage_end_io_hook(page, cur,
2292                                                          page_end, NULL, 1);
2293                         break;
2294                 }
2295                 em = epd->get_extent(inode, page, pg_offset, cur,
2296                                      end - cur + 1, 1);
2297                 if (IS_ERR_OR_NULL(em)) {
2298                         SetPageError(page);
2299                         break;
2300                 }
2301
2302                 extent_offset = cur - em->start;
2303                 BUG_ON(extent_map_end(em) <= cur);
2304                 BUG_ON(end < cur);
2305                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2306                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2307                 sector = (em->block_start + extent_offset) >> 9;
2308                 bdev = em->bdev;
2309                 block_start = em->block_start;
2310                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2311                 free_extent_map(em);
2312                 em = NULL;
2313
2314                 /*
2315                  * compressed and inline extents are written through other
2316                  * paths in the FS
2317                  */
2318                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2319                     block_start == EXTENT_MAP_INLINE) {
2320                         /*
2321                          * end_io notification does not happen here for
2322                          * compressed extents
2323                          */
2324                         if (!compressed && tree->ops &&
2325                             tree->ops->writepage_end_io_hook)
2326                                 tree->ops->writepage_end_io_hook(page, cur,
2327                                                          cur + iosize - 1,
2328                                                          NULL, 1);
2329                         else if (compressed) {
2330                                 /* we don't want to end_page_writeback on
2331                                  * a compressed extent.  this happens
2332                                  * elsewhere
2333                                  */
2334                                 nr++;
2335                         }
2336
2337                         cur += iosize;
2338                         pg_offset += iosize;
2339                         continue;
2340                 }
2341                 /* leave this out until we have a page_mkwrite call */
2342                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2343                                    EXTENT_DIRTY, 0, NULL)) {
2344                         cur = cur + iosize;
2345                         pg_offset += iosize;
2346                         continue;
2347                 }
2348
2349                 if (tree->ops && tree->ops->writepage_io_hook) {
2350                         ret = tree->ops->writepage_io_hook(page, cur,
2351                                                 cur + iosize - 1);
2352                 } else {
2353                         ret = 0;
2354                 }
2355                 if (ret) {
2356                         SetPageError(page);
2357                 } else {
2358                         unsigned long max_nr = end_index + 1;
2359
2360                         set_range_writeback(tree, cur, cur + iosize - 1);
2361                         if (!PageWriteback(page)) {
2362                                 printk(KERN_ERR "btrfs warning page %lu not "
2363                                        "writeback, cur %llu end %llu\n",
2364                                        page->index, (unsigned long long)cur,
2365                                        (unsigned long long)end);
2366                         }
2367
2368                         ret = submit_extent_page(write_flags, tree, page,
2369                                                  sector, iosize, pg_offset,
2370                                                  bdev, &epd->bio, max_nr,
2371                                                  end_bio_extent_writepage,
2372                                                  0, 0, 0);
2373                         if (ret)
2374                                 SetPageError(page);
2375                 }
2376                 cur = cur + iosize;
2377                 pg_offset += iosize;
2378                 nr++;
2379         }
2380 done:
2381         if (nr == 0) {
2382                 /* make sure the mapping tag for page dirty gets cleared */
2383                 set_page_writeback(page);
2384                 end_page_writeback(page);
2385         }
2386         unlock_page(page);
2387
2388 done_unlocked:
2389
2390         /* drop our reference on any cached states */
2391         free_extent_state(cached_state);
2392         return 0;
2393 }
2394
2395 /**
2396  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2397  * @mapping: address space structure to write
2398  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2399  * @writepage: function called for each page
2400  * @data: data passed to writepage function
2401  *
2402  * If a page is already under I/O, extent_write_cache_pages() skips it,
2403  * even if it's dirty.  This is desirable behaviour for memory-cleaning
2404  * writeback, but it is INCORRECT for data-integrity system calls such as
2405  * fsync().  fsync() and msync() need to guarantee that all the data which
2406  * was dirty at the time the call was made gets new I/O started against it.
2407  * If wbc->sync_mode is WB_SYNC_ALL then we were called for data integrity
2408  * and we must wait for existing IO to complete.
2409  */
2410 static int extent_write_cache_pages(struct extent_io_tree *tree,
2411                              struct address_space *mapping,
2412                              struct writeback_control *wbc,
2413                              writepage_t writepage, void *data,
2414                              void (*flush_fn)(void *))
2415 {
2416         int ret = 0;
2417         int done = 0;
2418         int nr_to_write_done = 0;
2419         struct pagevec pvec;
2420         int nr_pages;
2421         pgoff_t index;
2422         pgoff_t end;            /* Inclusive */
2423         int scanned = 0;
2424
2425         pagevec_init(&pvec, 0);
2426         if (wbc->range_cyclic) {
2427                 index = mapping->writeback_index; /* Start from prev offset */
2428                 end = -1;
2429         } else {
2430                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2431                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2432                 scanned = 1;
2433         }
2434 retry:
2435         while (!done && !nr_to_write_done && (index <= end) &&
2436                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2437                               PAGECACHE_TAG_DIRTY, min(end - index,
2438                                   (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2439                 unsigned i;
2440
2441                 scanned = 1;
2442                 for (i = 0; i < nr_pages; i++) {
2443                         struct page *page = pvec.pages[i];
2444
2445                         /*
2446                          * At this point we hold neither mapping->tree_lock nor
2447                          * lock on the page itself: the page may be truncated or
2448                          * invalidated (changing page->mapping to NULL), or even
2449                          * swizzled back from swapper_space to tmpfs file
2450                          * mapping
2451                          */
2452                         if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2453                                 tree->ops->write_cache_pages_lock_hook(page);
2454                         else
2455                                 lock_page(page);
2456
2457                         if (unlikely(page->mapping != mapping)) {
2458                                 unlock_page(page);
2459                                 continue;
2460                         }
2461
2462                         if (!wbc->range_cyclic && page->index > end) {
2463                                 done = 1;
2464                                 unlock_page(page);
2465                                 continue;
2466                         }
2467
2468                         if (wbc->sync_mode != WB_SYNC_NONE) {
2469                                 if (PageWriteback(page))
2470                                         flush_fn(data);
2471                                 wait_on_page_writeback(page);
2472                         }
2473
2474                         if (PageWriteback(page) ||
2475                             !clear_page_dirty_for_io(page)) {
2476                                 unlock_page(page);
2477                                 continue;
2478                         }
2479
2480                         ret = (*writepage)(page, wbc, data);
2481
2482                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2483                                 unlock_page(page);
2484                                 ret = 0;
2485                         }
2486                         if (ret)
2487                                 done = 1;
2488
2489                         /*
2490                          * the filesystem may choose to bump up nr_to_write.
2491                          * We have to make sure to honor the new nr_to_write
2492                          * at any time
2493                          */
2494                         nr_to_write_done = wbc->nr_to_write <= 0;
2495                 }
2496                 pagevec_release(&pvec);
2497                 cond_resched();
2498         }
2499         if (!scanned && !done) {
2500                 /*
2501                  * We hit the last page and there is more work to be done: wrap
2502                  * back to the start of the file
2503                  */
2504                 scanned = 1;
2505                 index = 0;
2506                 goto retry;
2507         }
2508         return ret;
2509 }
2510
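/* submit whatever bio has been built up in the epd, honoring sync_io */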
2511 static void flush_epd_write_bio(struct extent_page_data *epd)
2512 {
2513         if (epd->bio) {
2514                 if (epd->sync_io)
2515                         submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2516                 else
2517                         submit_one_bio(WRITE, epd->bio, 0, 0);
2518                 epd->bio = NULL;
2519         }
2520 }
2521
2522 static noinline void flush_write_bio(void *data)
2523 {
2524         struct extent_page_data *epd = data;
2525         flush_epd_write_bio(epd);
2526 }
2527
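/*
 * write out a single page.  After the requested page is written,
 * writeback is also kicked off for dirty pages that follow it
 * (wbc_writepages starts at the next page offset and allows 64 more
 * writes); the return value reflects only the page that was asked for.
 */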
2528 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2529                           get_extent_t *get_extent,
2530                           struct writeback_control *wbc)
2531 {
2532         int ret;
2533         struct address_space *mapping = page->mapping;
2534         struct extent_page_data epd = {
2535                 .bio = NULL,
2536                 .tree = tree,
2537                 .get_extent = get_extent,
2538                 .extent_locked = 0,
2539                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2540         };
2541         struct writeback_control wbc_writepages = {
2542                 .sync_mode      = wbc->sync_mode,
2543                 .older_than_this = NULL,
2544                 .nr_to_write    = 64,
2545                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2546                 .range_end      = (loff_t)-1,
2547         };
2548
2549         ret = __extent_writepage(page, wbc, &epd);
2550
2551         extent_write_cache_pages(tree, mapping, &wbc_writepages,
2552                                  __extent_writepage, &epd, flush_write_bio);
2553         flush_epd_write_bio(&epd);
2554         return ret;
2555 }
2556
2557 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2558                               u64 start, u64 end, get_extent_t *get_extent,
2559                               int mode)
2560 {
2561         int ret = 0;
2562         struct address_space *mapping = inode->i_mapping;
2563         struct page *page;
2564         unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2565                 PAGE_CACHE_SHIFT;
2566
2567         struct extent_page_data epd = {
2568                 .bio = NULL,
2569                 .tree = tree,
2570                 .get_extent = get_extent,
2571                 .extent_locked = 1,
2572                 .sync_io = mode == WB_SYNC_ALL,
2573         };
2574         struct writeback_control wbc_writepages = {
2575                 .sync_mode      = mode,
2576                 .older_than_this = NULL,
2577                 .nr_to_write    = nr_pages * 2,
2578                 .range_start    = start,
2579                 .range_end      = end + 1,
2580         };
2581
2582         while (start <= end) {
2583                 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2584                 if (clear_page_dirty_for_io(page))
2585                         ret = __extent_writepage(page, &wbc_writepages, &epd);
2586                 else {
2587                         if (tree->ops && tree->ops->writepage_end_io_hook)
2588                                 tree->ops->writepage_end_io_hook(page, start,
2589                                                  start + PAGE_CACHE_SIZE - 1,
2590                                                  NULL, 1);
2591                         unlock_page(page);
2592                 }
2593                 page_cache_release(page);
2594                 start += PAGE_CACHE_SIZE;
2595         }
2596
2597         flush_epd_write_bio(&epd);
2598         return ret;
2599 }
2600
2601 int extent_writepages(struct extent_io_tree *tree,
2602                       struct address_space *mapping,
2603                       get_extent_t *get_extent,
2604                       struct writeback_control *wbc)
2605 {
2606         int ret = 0;
2607         struct extent_page_data epd = {
2608                 .bio = NULL,
2609                 .tree = tree,
2610                 .get_extent = get_extent,
2611                 .extent_locked = 0,
2612                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2613         };
2614
2615         ret = extent_write_cache_pages(tree, mapping, wbc,
2616                                        __extent_writepage, &epd,
2617                                        flush_write_bio);
2618         flush_epd_write_bio(&epd);
2619         return ret;
2620 }
2621
2622 int extent_readpages(struct extent_io_tree *tree,
2623                      struct address_space *mapping,
2624                      struct list_head *pages, unsigned nr_pages,
2625                      get_extent_t get_extent)
2626 {
2627         struct bio *bio = NULL;
2628         unsigned page_idx;
2629         unsigned long bio_flags = 0;
2630
2631         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2632                 struct page *page = list_entry(pages->prev, struct page, lru);
2633
2634                 prefetchw(&page->flags);
2635                 list_del(&page->lru);
2636                 if (!add_to_page_cache_lru(page, mapping,
2637                                         page->index, GFP_NOFS)) {
2638                         __extent_read_full_page(tree, page, get_extent,
2639                                                 &bio, 0, &bio_flags);
2640                 }
2641                 page_cache_release(page);
2642         }
2643         BUG_ON(!list_empty(pages));
2644         if (bio)
2645                 submit_one_bio(READ, bio, 0, bio_flags);
2646         return 0;
2647 }
2648
2649 /*
2650  * basic invalidatepage code, this waits on any locked or writeback
2651  * ranges corresponding to the page, and then deletes any extent state
2652  * records from the tree
2653  */
2654 int extent_invalidatepage(struct extent_io_tree *tree,
2655                           struct page *page, unsigned long offset)
2656 {
2657         struct extent_state *cached_state = NULL;
2658         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2659         u64 end = start + PAGE_CACHE_SIZE - 1;
2660         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2661
2662         start += (offset + blocksize - 1) & ~(blocksize - 1);
2663         if (start > end)
2664                 return 0;
2665
2666         lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2667         wait_on_page_writeback(page);
2668         clear_extent_bit(tree, start, end,
2669                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2670                          EXTENT_DO_ACCOUNTING,
2671                          1, 1, &cached_state, GFP_NOFS);
2672         return 0;
2673 }
2674
2675 /*
2676  * a helper for releasepage, this tests for areas of the page that
2677  * are locked or under IO and drops the related state bits if it is safe
2678  * to drop the page.
2679  */
2680 int try_release_extent_state(struct extent_map_tree *map,
2681                              struct extent_io_tree *tree, struct page *page,
2682                              gfp_t mask)
2683 {
2684         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2685         u64 end = start + PAGE_CACHE_SIZE - 1;
2686         int ret = 1;
2687
2688         if (test_range_bit(tree, start, end,
2689                            EXTENT_IOBITS, 0, NULL))
2690                 ret = 0;
2691         else {
2692                 if ((mask & GFP_NOFS) == GFP_NOFS)
2693                         mask = GFP_NOFS;
2694                 /*
2695                  * at this point we can safely clear everything except the
2696                  * locked bit and the nodatasum bit
2697                  */
2698                 ret = clear_extent_bit(tree, start, end,
2699                                  ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2700                                  0, 0, NULL, mask);
2701
2702                 /* if clear_extent_bit failed for enomem reasons,
2703                  * we can't allow the release to continue.
2704                  */
2705                 if (ret < 0)
2706                         ret = 0;
2707                 else
2708                         ret = 1;
2709         }
2710         return ret;
2711 }
2712
2713 /*
2714  * a helper for releasepage.  As long as there are no locked extents
2715  * in the range corresponding to the page, both state records and extent
2716  * map records are removed
2717  */
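/*
 * Note the heuristic below: extent maps are only pruned when the caller
 * can sleep (__GFP_WAIT) and the file is larger than 16MB, presumably
 * because cached mappings for small files are cheap to keep around.
 * Pinned extents and ranges that are locked or under writeback are
 * always left in place.
 */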
2718 int try_release_extent_mapping(struct extent_map_tree *map,
2719                                struct extent_io_tree *tree, struct page *page,
2720                                gfp_t mask)
2721 {
2722         struct extent_map *em;
2723         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2724         u64 end = start + PAGE_CACHE_SIZE - 1;
2725
2726         if ((mask & __GFP_WAIT) &&
2727             page->mapping->host->i_size > 16 * 1024 * 1024) {
2728                 u64 len;
2729                 while (start <= end) {
2730                         len = end - start + 1;
2731                         write_lock(&map->lock);
2732                         em = lookup_extent_mapping(map, start, len);
2733                         if (IS_ERR_OR_NULL(em)) {
2734                                 write_unlock(&map->lock);
2735                                 break;
2736                         }
2737                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2738                             em->start != start) {
2739                                 write_unlock(&map->lock);
2740                                 free_extent_map(em);
2741                                 break;
2742                         }
2743                         if (!test_range_bit(tree, em->start,
2744                                             extent_map_end(em) - 1,
2745                                             EXTENT_LOCKED | EXTENT_WRITEBACK,
2746                                             0, NULL)) {
2747                                 remove_extent_mapping(map, em);
2748                                 /* once for the rb tree */
2749                                 free_extent_map(em);
2750                         }
2751                         start = extent_map_end(em);
2752                         write_unlock(&map->lock);
2753
2754                         /* once for us */
2755                         free_extent_map(em);
2756                 }
2757         }
2758         return try_release_extent_state(map, tree, page, mask);
2759 }
2760
2761 /*
2762  * helper function for fiemap, which doesn't want to see any holes.
2763  * This walks forward, skipping holes, until it finds a real extent or reaches 'last'
2764  */
2765 static struct extent_map *get_extent_skip_holes(struct inode *inode,
2766                                                 u64 offset,
2767                                                 u64 last,
2768                                                 get_extent_t *get_extent)
2769 {
2770         u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2771         struct extent_map *em;
2772         u64 len;
2773
2774         if (offset >= last)
2775                 return NULL;
2776
2777         while (1) {
2778                 len = last - offset;
2779                 if (len == 0)
2780                         break;
2781                 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2782                 em = get_extent(inode, NULL, 0, offset, len, 0);
2783                 if (IS_ERR_OR_NULL(em))
2784                         return em;
2785
2786                 /* if this isn't a hole return it */
2787                 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2788                     em->block_start != EXTENT_MAP_HOLE) {
2789                         return em;
2790                 }
2791
2792                 /* this is a hole, advance to the next extent */
2793                 offset = extent_map_end(em);
2794                 free_extent_map(em);
2795                 if (offset >= last)
2796                         break;
2797         }
2798         return NULL;
2799 }
2800
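/*
 * fiemap implementation.  The last file extent item is looked up first
 * so we know how far preallocation extends past i_size, then the range
 * is locked and walked with get_extent_skip_holes(), building fiemap
 * entries from the extent maps that come back.
 */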
2801 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2802                 __u64 start, __u64 len, get_extent_t *get_extent)
2803 {
2804         int ret = 0;
2805         u64 off = start;
2806         u64 max = start + len;
2807         u32 flags = 0;
2808         u32 found_type;
2809         u64 last;
2810         u64 last_for_get_extent = 0;
2811         u64 disko = 0;
2812         u64 isize = i_size_read(inode);
2813         struct btrfs_key found_key;
2814         struct extent_map *em = NULL;
2815         struct extent_state *cached_state = NULL;
2816         struct btrfs_path *path;
2817         struct btrfs_file_extent_item *item;
2818         int end = 0;
2819         u64 em_start = 0;
2820         u64 em_len = 0;
2821         u64 em_end = 0;
2822         unsigned long emflags;
2823
2824         if (len == 0)
2825                 return -EINVAL;
2826
2827         path = btrfs_alloc_path();
2828         if (!path)
2829                 return -ENOMEM;
2830         path->leave_spinning = 1;
2831
2832         /*
2833          * lookup the last file extent.  We're not using i_size here
2834          * because there might be preallocation past i_size
2835          */
2836         ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2837                                        path, btrfs_ino(inode), -1, 0);
2838         if (ret < 0) {
2839                 btrfs_free_path(path);
2840                 return ret;
2841         }
2842         WARN_ON(!ret);
2843         path->slots[0]--;
2844         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2845                               struct btrfs_file_extent_item);
2846         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2847         found_type = btrfs_key_type(&found_key);
2848
2849         /* No extents, but there might be delalloc bits */
2850         if (found_key.objectid != btrfs_ino(inode) ||
2851             found_type != BTRFS_EXTENT_DATA_KEY) {
2852                 /* have to trust i_size as the end */
2853                 last = (u64)-1;
2854                 last_for_get_extent = isize;
2855         } else {
2856                 /*
2857                  * remember the start of the last extent.  There are a
2858                  * bunch of different factors that go into the length of the
2859                  * extent, so it's much less complex to remember where it started
2860                  */
2861                 last = found_key.offset;
2862                 last_for_get_extent = last + 1;
2863         }
2864         btrfs_free_path(path);
2865
2866         /*
2867          * we might have some extents allocated but more delalloc past those
2868          * extents.  so, we trust isize unless the start of the last extent is
2869          * beyond isize
2870          */
2871         if (last < isize) {
2872                 last = (u64)-1;
2873                 last_for_get_extent = isize;
2874         }
2875
2876         lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
2877                          &cached_state, GFP_NOFS);
2878
2879         em = get_extent_skip_holes(inode, off, last_for_get_extent,
2880                                    get_extent);
2881         if (!em)
2882                 goto out;
2883         if (IS_ERR(em)) {
2884                 ret = PTR_ERR(em);
2885                 goto out;
2886         }
2887
2888         while (!end) {
2889                 u64 offset_in_extent;
2890
2891                 /* break if the extent we found is outside the range */
2892                 if (em->start >= max || extent_map_end(em) < off)
2893                         break;
2894
2895                 /*
2896                  * get_extent may return an extent that starts before our
2897                  * requested range.  We have to make sure the ranges
2898                  * we return to fiemap always move forward and don't
2899                  * overlap, so adjust the offsets here
2900                  */
2901                 em_start = max(em->start, off);
2902
2903                 /*
2904                  * record the offset from the start of the extent
2905                  * for adjusting the disk offset below
2906                  */
2907                 offset_in_extent = em_start - em->start;
2908                 em_end = extent_map_end(em);
2909                 em_len = em_end - em_start;
2910                 emflags = em->flags;
2911                 disko = 0;
2912                 flags = 0;
2913
2914                 /*
2915                  * bump off for our next call to get_extent
2916                  */
2917                 off = extent_map_end(em);
2918                 if (off >= max)
2919                         end = 1;
2920
2921                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2922                         end = 1;
2923                         flags |= FIEMAP_EXTENT_LAST;
2924                 } else if (em->block_start == EXTENT_MAP_INLINE) {
2925                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
2926                                   FIEMAP_EXTENT_NOT_ALIGNED);
2927                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2928                         flags |= (FIEMAP_EXTENT_DELALLOC |
2929                                   FIEMAP_EXTENT_UNKNOWN);
2930                 } else {
2931                         disko = em->block_start + offset_in_extent;
2932                 }
2933                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2934                         flags |= FIEMAP_EXTENT_ENCODED;
2935
2936                 free_extent_map(em);
2937                 em = NULL;
2938                 if ((em_start >= last) || em_len == (u64)-1 ||
2939                    (last == (u64)-1 && isize <= em_end)) {
2940                         flags |= FIEMAP_EXTENT_LAST;
2941                         end = 1;
2942                 }
2943
2944                 /* now scan forward to see if this is really the last extent. */
2945                 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2946                                            get_extent);
2947                 if (IS_ERR(em)) {
2948                         ret = PTR_ERR(em);
2949                         goto out;
2950                 }
2951                 if (!em) {
2952                         flags |= FIEMAP_EXTENT_LAST;
2953                         end = 1;
2954                 }
2955                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2956                                               em_len, flags);
2957                 if (ret)
2958                         goto out_free;
2959         }
2960 out_free:
2961         free_extent_map(em);
2962 out:
2963         unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
2964                              &cached_state, GFP_NOFS);
2965         return ret;
2966 }
2967
2968 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2969                                               unsigned long i)
2970 {
2971         struct page *p;
2972         struct address_space *mapping;
2973
2974         if (i == 0)
2975                 return eb->first_page;
2976         i += eb->start >> PAGE_CACHE_SHIFT;
2977         mapping = eb->first_page->mapping;
2978         if (!mapping)
2979                 return NULL;
2980
2981         /*
2982          * extent_buffer_page is only called after pinning the page
2983          * by increasing the reference count.  So we know the page must
2984          * be in the radix tree.
2985          */
2986         rcu_read_lock();
2987         p = radix_tree_lookup(&mapping->page_tree, i);
2988         rcu_read_unlock();
2989
2990         return p;
2991 }
2992
2993 static inline unsigned long num_extent_pages(u64 start, u64 len)
2994 {
2995         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2996                 (start >> PAGE_CACHE_SHIFT);
2997 }
2998
2999 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3000                                                    u64 start,
3001                                                    unsigned long len,
3002                                                    gfp_t mask)
3003 {
3004         struct extent_buffer *eb = NULL;
3005 #if LEAK_DEBUG
3006         unsigned long flags;
3007 #endif
3008
3009         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3010         if (eb == NULL)
3011                 return NULL;
3012         eb->start = start;
3013         eb->len = len;
3014         spin_lock_init(&eb->lock);
3015         init_waitqueue_head(&eb->lock_wq);
3016
3017 #if LEAK_DEBUG
3018         spin_lock_irqsave(&leak_lock, flags);
3019         list_add(&eb->leak_list, &buffers);
3020         spin_unlock_irqrestore(&leak_lock, flags);
3021 #endif
3022         atomic_set(&eb->refs, 1);
3023
3024         return eb;
3025 }
3026
3027 static void __free_extent_buffer(struct extent_buffer *eb)
3028 {
3029 #if LEAK_DEBUG
3030         unsigned long flags;
3031         spin_lock_irqsave(&leak_lock, flags);
3032         list_del(&eb->leak_list);
3033         spin_unlock_irqrestore(&leak_lock, flags);
3034 #endif
3035         kmem_cache_free(extent_buffer_cache, eb);
3036 }
3037
3038 /*
3039  * Helper for releasing the pages of an extent buffer.
3040  */
3041 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3042                                                 unsigned long start_idx)
3043 {
3044         unsigned long index;
3045         struct page *page;
3046
3047         if (!eb->first_page)
3048                 return;
3049
3050         index = num_extent_pages(eb->start, eb->len);
3051         if (start_idx >= index)
3052                 return;
3053
3054         do {
3055                 index--;
3056                 page = extent_buffer_page(eb, index);
3057                 if (page)
3058                         page_cache_release(page);
3059         } while (index != start_idx);
3060 }
3061
3062 /*
3063  * Helper for releasing the extent buffer.
3064  */
3065 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3066 {
3067         btrfs_release_extent_buffer_page(eb, 0);
3068         __free_extent_buffer(eb);
3069 }
3070
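     /*
      * alloc_extent_buffer -- return the extent buffer covering
      * [start, start + len), allocating it if needed.
      *
      * A buffer already cached in the tree's radix tree is returned with an
      * extra reference.  Otherwise a new buffer is allocated, its pages are
      * found or created in the page cache, and it is inserted into the radix
      * tree; if we lose that insertion race (-EEXIST), the existing buffer is
      * returned instead and our copy is torn down.
      */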
3071 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3072                                           u64 start, unsigned long len,
3073                                           struct page *page0)
3074 {
3075         unsigned long num_pages = num_extent_pages(start, len);
3076         unsigned long i;
3077         unsigned long index = start >> PAGE_CACHE_SHIFT;
3078         struct extent_buffer *eb;
3079         struct extent_buffer *exists = NULL;
3080         struct page *p;
3081         struct address_space *mapping = tree->mapping;
3082         int uptodate = 1;
3083         int ret;
3084
3085         rcu_read_lock();
3086         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3087         if (eb && atomic_inc_not_zero(&eb->refs)) {
3088                 rcu_read_unlock();
3089                 mark_page_accessed(eb->first_page);
3090                 return eb;
3091         }
3092         rcu_read_unlock();
3093
3094         eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3095         if (!eb)
3096                 return NULL;
3097
3098         if (page0) {
3099                 eb->first_page = page0;
3100                 i = 1;
3101                 index++;
3102                 page_cache_get(page0);
3103                 mark_page_accessed(page0);
3104                 set_page_extent_mapped(page0);
3105                 set_page_extent_head(page0, len);
3106                 uptodate = PageUptodate(page0);
3107         } else {
3108                 i = 0;
3109         }
3110         for (; i < num_pages; i++, index++) {
3111                 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
3112                 if (!p) {
3113                         WARN_ON(1);
3114                         goto free_eb;
3115                 }
3116                 set_page_extent_mapped(p);
3117                 mark_page_accessed(p);
3118                 if (i == 0) {
3119                         eb->first_page = p;
3120                         set_page_extent_head(p, len);
3121                 } else {
3122                         set_page_private(p, EXTENT_PAGE_PRIVATE);
3123                 }
3124                 if (!PageUptodate(p))
3125                         uptodate = 0;
3126
3127                 /*
3128                  * see below about how we avoid a nasty race with release page
3129                  * and why we unlock later
3130                  */
3131                 if (i != 0)
3132                         unlock_page(p);
3133         }
3134         if (uptodate)
3135                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3136
3137         ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3138         if (ret)
3139                 goto free_eb;
3140
3141         spin_lock(&tree->buffer_lock);
3142         ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3143         if (ret == -EEXIST) {
3144                 exists = radix_tree_lookup(&tree->buffer,
3145                                                 start >> PAGE_CACHE_SHIFT);
3146                 /* add one reference for the caller */
3147                 atomic_inc(&exists->refs);
3148                 spin_unlock(&tree->buffer_lock);
3149                 radix_tree_preload_end();
3150                 goto free_eb;
3151         }
3152         /* add one reference for the tree */
3153         atomic_inc(&eb->refs);
3154         spin_unlock(&tree->buffer_lock);
3155         radix_tree_preload_end();
3156
3157         /*
3158          * there is a race where release page may have
3159          * tried to find this extent buffer in the radix tree
3160          * but failed.  It will tell the VM it is safe to
3161          * reclaim the page, and it will clear the page private bit.
3162          * We must make sure to set the page private bit properly
3163          * after the extent buffer is in the radix tree so
3164          * it doesn't get lost
3165          */
3166         set_page_extent_mapped(eb->first_page);
3167         set_page_extent_head(eb->first_page, eb->len);
3168         if (!page0)
3169                 unlock_page(eb->first_page);
3170         return eb;
3171
3172 free_eb:
3173         if (eb->first_page && !page0)
3174                 unlock_page(eb->first_page);
3175
3176         if (!atomic_dec_and_test(&eb->refs))
3177                 return exists;
3178         btrfs_release_extent_buffer(eb);
3179         return exists;
3180 }
3181
3182 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3183                                          u64 start, unsigned long len)
3184 {
3185         struct extent_buffer *eb;
3186
3187         rcu_read_lock();
3188         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3189         if (eb && atomic_inc_not_zero(&eb->refs)) {
3190                 rcu_read_unlock();
3191                 mark_page_accessed(eb->first_page);
3192                 return eb;
3193         }
3194         rcu_read_unlock();
3195
3196         return NULL;
3197 }
3198
3199 void free_extent_buffer(struct extent_buffer *eb)
3200 {
3201         if (!eb)
3202                 return;
3203
3204         if (!atomic_dec_and_test(&eb->refs))
3205                 return;
3206
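             /*
              * The tree holds its own reference (normally dropped only by
              * try_release_extent_buffer()), so a caller's ref drop should
              * never bring the count to zero here.
              */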
3207         WARN_ON(1);
3208 }
3209
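     /*
      * clear_extent_buffer_dirty -- clear the dirty bit on every page backing
      * @eb and, once a page is clean, drop its PAGECACHE_TAG_DIRTY tag so
      * writeback will skip it.
      */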
3210 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3211                               struct extent_buffer *eb)
3212 {
3213         unsigned long i;
3214         unsigned long num_pages;
3215         struct page *page;
3216
3217         num_pages = num_extent_pages(eb->start, eb->len);
3218
3219         for (i = 0; i < num_pages; i++) {
3220                 page = extent_buffer_page(eb, i);
3221                 if (!PageDirty(page))
3222                         continue;
3223
3224                 lock_page(page);
3225                 WARN_ON(!PagePrivate(page));
3226
3227                 set_page_extent_mapped(page);
3228                 if (i == 0)
3229                         set_page_extent_head(page, eb->len);
3230
3231                 clear_page_dirty_for_io(page);
3232                 spin_lock_irq(&page->mapping->tree_lock);
3233                 if (!PageDirty(page)) {
3234                         radix_tree_tag_clear(&page->mapping->page_tree,
3235                                                 page_index(page),
3236                                                 PAGECACHE_TAG_DIRTY);
3237                 }
3238                 spin_unlock_irq(&page->mapping->tree_lock);
3239                 unlock_page(page);
3240         }
3241         return 0;
3242 }
3243
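     /*
      * set_extent_buffer_dirty -- mark every page backing @eb dirty and set
      * EXTENT_BUFFER_DIRTY; returns non-zero if the buffer was already dirty.
      */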
3244 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3245                              struct extent_buffer *eb)
3246 {
3247         unsigned long i;
3248         unsigned long num_pages;
3249         int was_dirty = 0;
3250
3251         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3252         num_pages = num_extent_pages(eb->start, eb->len);
3253         for (i = 0; i < num_pages; i++)
3254                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3255         return was_dirty;
3256 }
3257
3258 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3259                                 struct extent_buffer *eb,
3260                                 struct extent_state **cached_state)
3261 {
3262         unsigned long i;
3263         struct page *page;
3264         unsigned long num_pages;
3265
3266         num_pages = num_extent_pages(eb->start, eb->len);
3267         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3268
3269         clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3270                               cached_state, GFP_NOFS);
3271         for (i = 0; i < num_pages; i++) {
3272                 page = extent_buffer_page(eb, i);
3273                 if (page)
3274                         ClearPageUptodate(page);
3275         }
3276         return 0;
3277 }
3278
3279 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3280                                 struct extent_buffer *eb)
3281 {
3282         unsigned long i;
3283         struct page *page;
3284         unsigned long num_pages;
3285
3286         num_pages = num_extent_pages(eb->start, eb->len);
3287
3288         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3289                             NULL, GFP_NOFS);
3290         for (i = 0; i < num_pages; i++) {
3291                 page = extent_buffer_page(eb, i);
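                     /*
                      * The first and last pages may be only partially covered
                      * by this buffer, so don't blindly mark them uptodate;
                      * check_page_uptodate() sets PageUptodate only once the
                      * whole page range is uptodate in the tree.
                      */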
3292                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3293                     ((i == num_pages - 1) &&
3294                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3295                         check_page_uptodate(tree, page);
3296                         continue;
3297                 }
3298                 SetPageUptodate(page);
3299         }
3300         return 0;
3301 }
3302
3303 int extent_range_uptodate(struct extent_io_tree *tree,
3304                           u64 start, u64 end)
3305 {
3306         struct page *page;
3307         int ret;
3308         int pg_uptodate = 1;
3309         int uptodate;
3310         unsigned long index;
3311
3312         ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3313         if (ret)
3314                 return 1;
3315         while (start <= end) {
3316                 index = start >> PAGE_CACHE_SHIFT;
3317                 page = find_get_page(tree->mapping, index);
3318                 uptodate = PageUptodate(page);
3319                 page_cache_release(page);
3320                 if (!uptodate) {
3321                         pg_uptodate = 0;
3322                         break;
3323                 }
3324                 start += PAGE_CACHE_SIZE;
3325         }
3326         return pg_uptodate;
3327 }
3328
3329 int extent_buffer_uptodate(struct extent_io_tree *tree,
3330                            struct extent_buffer *eb,
3331                            struct extent_state *cached_state)
3332 {
3333         int ret = 0;
3334         unsigned long num_pages;
3335         unsigned long i;
3336         struct page *page;
3337         int pg_uptodate = 1;
3338
3339         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3340                 return 1;
3341
3342         ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3343                            EXTENT_UPTODATE, 1, cached_state);
3344         if (ret)
3345                 return ret;
3346
3347         num_pages = num_extent_pages(eb->start, eb->len);
3348         for (i = 0; i < num_pages; i++) {
3349                 page = extent_buffer_page(eb, i);
3350                 if (!PageUptodate(page)) {
3351                         pg_uptodate = 0;
3352                         break;
3353                 }
3354         }
3355         return pg_uptodate;
3356 }
3357
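     /*
      * read_extent_buffer_pages -- make sure the pages backing @eb are
      * uptodate, reading them from disk if necessary.
      *
      * All pages from @start onwards are locked first; if every one is
      * already uptodate we just set EXTENT_BUFFER_UPTODATE and unlock.
      * Otherwise reads are submitted for the stale pages and, when @wait is
      * set, we wait for the IO and return -EIO if any page still isn't
      * uptodate.
      */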
3358 int read_extent_buffer_pages(struct extent_io_tree *tree,
3359                              struct extent_buffer *eb,
3360                              u64 start, int wait,
3361                              get_extent_t *get_extent, int mirror_num)
3362 {
3363         unsigned long i;
3364         unsigned long start_i;
3365         struct page *page;
3366         int err;
3367         int ret = 0;
3368         int locked_pages = 0;
3369         int all_uptodate = 1;
3370         int inc_all_pages = 0;
3371         unsigned long num_pages;
3372         struct bio *bio = NULL;
3373         unsigned long bio_flags = 0;
3374
3375         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3376                 return 0;
3377
3378         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3379                            EXTENT_UPTODATE, 1, NULL)) {
3380                 return 0;
3381         }
3382
3383         if (start) {
3384                 WARN_ON(start < eb->start);
3385                 start_i = (start >> PAGE_CACHE_SHIFT) -
3386                         (eb->start >> PAGE_CACHE_SHIFT);
3387         } else {
3388                 start_i = 0;
3389         }
3390
3391         num_pages = num_extent_pages(eb->start, eb->len);
3392         for (i = start_i; i < num_pages; i++) {
3393                 page = extent_buffer_page(eb, i);
3394                 if (!wait) {
3395                         if (!trylock_page(page))
3396                                 goto unlock_exit;
3397                 } else {
3398                         lock_page(page);
3399                 }
3400                 locked_pages++;
3401                 if (!PageUptodate(page))
3402                         all_uptodate = 0;
3403         }
3404         if (all_uptodate) {
3405                 if (start_i == 0)
3406                         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3407                 goto unlock_exit;
3408         }
3409
3410         for (i = start_i; i < num_pages; i++) {
3411                 page = extent_buffer_page(eb, i);
3412
3413                 WARN_ON(!PagePrivate(page));
3414
3415                 set_page_extent_mapped(page);
3416                 if (i == 0)
3417                         set_page_extent_head(page, eb->len);
3418
3419                 if (inc_all_pages)
3420                         page_cache_get(page);
3421                 if (!PageUptodate(page)) {
3422                         if (start_i == 0)
3423                                 inc_all_pages = 1;
3424                         ClearPageError(page);
3425                         err = __extent_read_full_page(tree, page,
3426                                                       get_extent, &bio,
3427                                                       mirror_num, &bio_flags);
3428                         if (err)
3429                                 ret = err;
3430                 } else {
3431                         unlock_page(page);
3432                 }
3433         }
3434
3435         if (bio)
3436                 submit_one_bio(READ, bio, mirror_num, bio_flags);
3437
3438         if (ret || !wait)
3439                 return ret;
3440
3441         for (i = start_i; i < num_pages; i++) {
3442                 page = extent_buffer_page(eb, i);
3443                 wait_on_page_locked(page);
3444                 if (!PageUptodate(page))
3445                         ret = -EIO;
3446         }
3447
3448         if (!ret)
3449                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3450         return ret;
3451
3452 unlock_exit:
3453         i = start_i;
3454         while (locked_pages > 0) {
3455                 page = extent_buffer_page(eb, i);
3456                 i++;
3457                 unlock_page(page);
3458                 locked_pages--;
3459         }
3460         return ret;
3461 }
3462
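     /*
      * read_extent_buffer -- copy @len bytes starting at byte @start of @eb
      * into the contiguous buffer @dstv, walking the backing pages with
      * kmap_atomic().
      */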
3463 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3464                         unsigned long start,
3465                         unsigned long len)
3466 {
3467         size_t cur;
3468         size_t offset;
3469         struct page *page;
3470         char *kaddr;
3471         char *dst = (char *)dstv;
3472         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475         WARN_ON(start > eb->len);
3476         WARN_ON(start + len > eb->start + eb->len);
3477
3478         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480         while (len > 0) {
3481                 page = extent_buffer_page(eb, i);
3482
3483                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3484                 kaddr = kmap_atomic(page, KM_USER1);
3485                 memcpy(dst, kaddr + offset, cur);
3486                 kunmap_atomic(kaddr, KM_USER1);
3487
3488                 dst += cur;
3489                 len -= cur;
3490                 offset = 0;
3491                 i++;
3492         }
3493 }
3494
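     /*
      * map_private_extent_buffer -- kmap the single page that contains
      * [start, start + min_len); the range must not cross a page boundary
      * (-EINVAL if it does).  On return *map points at buffer offset
      * *map_start and *map_len bytes are addressable from there.
      */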
3495 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3496                                unsigned long min_len, char **token, char **map,
3497                                unsigned long *map_start,
3498                                unsigned long *map_len, int km)
3499 {
3500         size_t offset = start & (PAGE_CACHE_SIZE - 1);
3501         char *kaddr;
3502         struct page *p;
3503         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505         unsigned long end_i = (start_offset + start + min_len - 1) >>
3506                 PAGE_CACHE_SHIFT;
3507
3508         if (i != end_i)
3509                 return -EINVAL;
3510
3511         if (i == 0) {
3512                 offset = start_offset;
3513                 *map_start = 0;
3514         } else {
3515                 offset = 0;
3516                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3517         }
3518
3519         if (start + min_len > eb->len) {
3520                 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3521                        "wanted %lu %lu\n", (unsigned long long)eb->start,
3522                        eb->len, start, min_len);
3523                 WARN_ON(1);
3524                 return -EINVAL;
3525         }
3526
3527         p = extent_buffer_page(eb, i);
3528         kaddr = kmap_atomic(p, km);
3529         *token = kaddr;
3530         *map = kaddr + offset;
3531         *map_len = PAGE_CACHE_SIZE - offset;
3532         return 0;
3533 }
3534
3535 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3536                       unsigned long min_len,
3537                       char **token, char **map,
3538                       unsigned long *map_start,
3539                       unsigned long *map_len, int km)
3540 {
3541         int err;
3542         int save = 0;
3543         if (eb->map_token) {
3544                 unmap_extent_buffer(eb, eb->map_token, km);
3545                 eb->map_token = NULL;
3546                 save = 1;
3547         }
3548         err = map_private_extent_buffer(eb, start, min_len, token, map,
3549                                        map_start, map_len, km);
3550         if (!err && save) {
3551                 eb->map_token = *token;
3552                 eb->kaddr = *map;
3553                 eb->map_start = *map_start;
3554                 eb->map_len = *map_len;
3555         }
3556         return err;
3557 }
3558
3559 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3560 {
3561         kunmap_atomic(token, km);
3562 }
3563
3564 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3565                           unsigned long start,
3566                           unsigned long len)
3567 {
3568         size_t cur;
3569         size_t offset;
3570         struct page *page;
3571         char *kaddr;
3572         char *ptr = (char *)ptrv;
3573         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3574         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3575         int ret = 0;
3576
3577         WARN_ON(start > eb->len);
3578         WARN_ON(start + len > eb->start + eb->len);
3579
3580         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3581
3582         while (len > 0) {
3583                 page = extent_buffer_page(eb, i);
3584
3585                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3586
3587                 kaddr = kmap_atomic(page, KM_USER0);
3588                 ret = memcmp(ptr, kaddr + offset, cur);
3589                 kunmap_atomic(kaddr, KM_USER0);
3590                 if (ret)
3591                         break;
3592
3593                 ptr += cur;
3594                 len -= cur;
3595                 offset = 0;
3596                 i++;
3597         }
3598         return ret;
3599 }
3600
3601 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3602                          unsigned long start, unsigned long len)
3603 {
3604         size_t cur;
3605         size_t offset;
3606         struct page *page;
3607         char *kaddr;
3608         char *src = (char *)srcv;
3609         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3610         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3611
3612         WARN_ON(start > eb->len);
3613         WARN_ON(start + len > eb->start + eb->len);
3614
3615         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3616
3617         while (len > 0) {
3618                 page = extent_buffer_page(eb, i);
3619                 WARN_ON(!PageUptodate(page));
3620
3621                 cur = min(len, PAGE_CACHE_SIZE - offset);
3622                 kaddr = kmap_atomic(page, KM_USER1);
3623                 memcpy(kaddr + offset, src, cur);
3624                 kunmap_atomic(kaddr, KM_USER1);
3625
3626                 src += cur;
3627                 len -= cur;
3628                 offset = 0;
3629                 i++;
3630         }
3631 }
3632
3633 void memset_extent_buffer(struct extent_buffer *eb, char c,
3634                           unsigned long start, unsigned long len)
3635 {
3636         size_t cur;
3637         size_t offset;
3638         struct page *page;
3639         char *kaddr;
3640         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3641         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3642
3643         WARN_ON(start > eb->len);
3644         WARN_ON(start + len > eb->start + eb->len);
3645
3646         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3647
3648         while (len > 0) {
3649                 page = extent_buffer_page(eb, i);
3650                 WARN_ON(!PageUptodate(page));
3651
3652                 cur = min(len, PAGE_CACHE_SIZE - offset);
3653                 kaddr = kmap_atomic(page, KM_USER0);
3654                 memset(kaddr + offset, c, cur);
3655                 kunmap_atomic(kaddr, KM_USER0);
3656
3657                 len -= cur;
3658                 offset = 0;
3659                 i++;
3660         }
3661 }
3662
3663 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3664                         unsigned long dst_offset, unsigned long src_offset,
3665                         unsigned long len)
3666 {
3667         u64 dst_len = dst->len;
3668         size_t cur;
3669         size_t offset;
3670         struct page *page;
3671         char *kaddr;
3672         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3673         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3674
3675         WARN_ON(src->len != dst_len);
3676
3677         offset = (start_offset + dst_offset) &
3678                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3679
3680         while (len > 0) {
3681                 page = extent_buffer_page(dst, i);
3682                 WARN_ON(!PageUptodate(page));
3683
3684                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3685
3686                 kaddr = kmap_atomic(page, KM_USER0);
3687                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3688                 kunmap_atomic(kaddr, KM_USER0);
3689
3690                 src_offset += cur;
3691                 len -= cur;
3692                 offset = 0;
3693                 i++;
3694         }
3695 }
3696
3697 static void move_pages(struct page *dst_page, struct page *src_page,
3698                        unsigned long dst_off, unsigned long src_off,
3699                        unsigned long len)
3700 {
3701         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3702         if (dst_page == src_page) {
3703                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3704         } else {
3705                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3706                 char *p = dst_kaddr + dst_off + len;
3707                 char *s = src_kaddr + src_off + len;
3708
3709                 while (len--)
3710                         *--p = *--s;
3711
3712                 kunmap_atomic(src_kaddr, KM_USER1);
3713         }
3714         kunmap_atomic(dst_kaddr, KM_USER0);
3715 }
3716
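     /* true if the ranges [src, src + len) and [dst, dst + len) overlap */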
3717 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3718 {
3719         unsigned long distance = (src > dst) ? src - dst : dst - src;
3720         return distance < len;
3721 }
3722
3723 static void copy_pages(struct page *dst_page, struct page *src_page,
3724                        unsigned long dst_off, unsigned long src_off,
3725                        unsigned long len)
3726 {
3727         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3728         char *src_kaddr;
3729
3730         if (dst_page != src_page) {
3731                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3732         } else {
3733                 src_kaddr = dst_kaddr;
3734                 BUG_ON(areas_overlap(src_off, dst_off, len));
3735         }
3736
3737         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3738         kunmap_atomic(dst_kaddr, KM_USER0);
3739         if (dst_page != src_page)
3740                 kunmap_atomic(src_kaddr, KM_USER1);
3741 }
3742
3743 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3744                            unsigned long src_offset, unsigned long len)
3745 {
3746         size_t cur;
3747         size_t dst_off_in_page;
3748         size_t src_off_in_page;
3749         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3750         unsigned long dst_i;
3751         unsigned long src_i;
3752
3753         if (src_offset + len > dst->len) {
3754                 printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
3755                        "len %lu dst len %lu\n", src_offset, len, dst->len);
3756                 BUG_ON(1);
3757         }
3758         if (dst_offset + len > dst->len) {
3759                 printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
3760                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
3761                 BUG_ON(1);
3762         }
3763
3764         while (len > 0) {
3765                 dst_off_in_page = (start_offset + dst_offset) &
3766                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3767                 src_off_in_page = (start_offset + src_offset) &
3768                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3769
3770                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3771                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3772
3773                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3774                                                src_off_in_page));
3775                 cur = min_t(unsigned long, cur,
3776                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3777
3778                 copy_pages(extent_buffer_page(dst, dst_i),
3779                            extent_buffer_page(dst, src_i),
3780                            dst_off_in_page, src_off_in_page, cur);
3781
3782                 src_offset += cur;
3783                 dst_offset += cur;
3784                 len -= cur;
3785         }
3786 }
3787
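     /*
      * memmove_extent_buffer -- like memcpy_extent_buffer(), but safe for
      * overlapping ranges: when the source and destination overlap the copy
      * is done back to front, page by page, via move_pages().
      */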
3788 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3789                            unsigned long src_offset, unsigned long len)
3790 {
3791         size_t cur;
3792         size_t dst_off_in_page;
3793         size_t src_off_in_page;
3794         unsigned long dst_end = dst_offset + len - 1;
3795         unsigned long src_end = src_offset + len - 1;
3796         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3797         unsigned long dst_i;
3798         unsigned long src_i;
3799
3800         if (src_offset + len > dst->len) {
3801                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3802                        "len %lu dst len %lu\n", src_offset, len, dst->len);
3803                 BUG_ON(1);
3804         }
3805         if (dst_offset + len > dst->len) {
3806                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3807                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
3808                 BUG_ON(1);
3809         }
3810         if (!areas_overlap(src_offset, dst_offset, len)) {
3811                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3812                 return;
3813         }
3814         while (len > 0) {
3815                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3816                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3817
3818                 dst_off_in_page = (start_offset + dst_end) &
3819                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3820                 src_off_in_page = (start_offset + src_end) &
3821                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3822
3823                 cur = min_t(unsigned long, len, src_off_in_page + 1);
3824                 cur = min(cur, dst_off_in_page + 1);
3825                 move_pages(extent_buffer_page(dst, dst_i),
3826                            extent_buffer_page(dst, src_i),
3827                            dst_off_in_page - cur + 1,
3828                            src_off_in_page - cur + 1, cur);
3829
3830                 dst_end -= cur;
3831                 src_end -= cur;
3832                 len -= cur;
3833         }
3834 }
3835
3836 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3837 {
3838         struct extent_buffer *eb =
3839                         container_of(head, struct extent_buffer, rcu_head);
3840
3841         btrfs_release_extent_buffer(eb);
3842 }
3843
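     /*
      * try_release_extent_buffer -- release the extent buffer backing @page
      * if it is not dirty and nobody else holds a reference.  The tree's
      * reference is dropped with a 1 -> 0 cmpxchg, the buffer is removed
      * from the radix tree and freed after an RCU grace period.  Returns 1
      * if the page can be released, 0 otherwise.
      */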
3844 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3845 {
3846         u64 start = page_offset(page);
3847         struct extent_buffer *eb;
3848         int ret = 1;
3849
3850         spin_lock(&tree->buffer_lock);
3851         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3852         if (!eb) {
3853                 spin_unlock(&tree->buffer_lock);
3854                 return ret;
3855         }
3856
3857         if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3858                 ret = 0;
3859                 goto out;
3860         }
3861
3862         /*
3863          * set @eb->refs to 0 if it is currently 1, i.e. only the tree holds
3864          * a reference, and then release the @eb.  Otherwise back off.
3865          */
3866         if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3867                 ret = 0;
3868                 goto out;
3869         }
3870
3871         radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3872 out:
3873         spin_unlock(&tree->buffer_lock);
3874
3875         /* at this point we can safely release the extent buffer */
3876         if (atomic_read(&eb->refs) == 0)
3877                 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3878         return ret;
3879 }