Btrfs: fix deadlock with nested trans handles
[pandora-kernel.git] fs/btrfs/extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20 #include "volumes.h"
21
22 static struct kmem_cache *extent_state_cache;
23 static struct kmem_cache *extent_buffer_cache;
24
25 static LIST_HEAD(buffers);
26 static LIST_HEAD(states);
27
28 #define LEAK_DEBUG 0
29 #if LEAK_DEBUG
30 static DEFINE_SPINLOCK(leak_lock);
31 #endif
32
33 #define BUFFER_LRU_MAX 64
34
35 struct tree_entry {
36         u64 start;
37         u64 end;
38         struct rb_node rb_node;
39 };
40
41 struct extent_page_data {
42         struct bio *bio;
43         struct extent_io_tree *tree;
44         get_extent_t *get_extent;
45
46         /* tells writepage not to lock the state bits for this range
47          * it still does the unlocking
48          */
49         unsigned int extent_locked:1;
50
51         /* tells the submit_bio code to use a WRITE_SYNC */
52         unsigned int sync_io:1;
53 };
54
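/*
 * called at module init time to create the slab caches backing
 * struct extent_state and struct extent_buffer allocations;
 * returns -ENOMEM if either cache cannot be created
 */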
55 int __init extent_io_init(void)
56 {
57         extent_state_cache = kmem_cache_create("extent_state",
58                         sizeof(struct extent_state), 0,
59                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
60         if (!extent_state_cache)
61                 return -ENOMEM;
62
63         extent_buffer_cache = kmem_cache_create("extent_buffers",
64                         sizeof(struct extent_buffer), 0,
65                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
66         if (!extent_buffer_cache)
67                 goto free_state_cache;
68         return 0;
69
70 free_state_cache:
71         kmem_cache_destroy(extent_state_cache);
72         return -ENOMEM;
73 }
74
75 void extent_io_exit(void)
76 {
77         struct extent_state *state;
78         struct extent_buffer *eb;
79
80         while (!list_empty(&states)) {
81                 state = list_entry(states.next, struct extent_state, leak_list);
82                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
83                        "state %lu in tree %p refs %d\n",
84                        (unsigned long long)state->start,
85                        (unsigned long long)state->end,
86                        state->state, state->tree, atomic_read(&state->refs));
87                 list_del(&state->leak_list);
88                 kmem_cache_free(extent_state_cache, state);
89
90         }
91
92         while (!list_empty(&buffers)) {
93                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
94                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
95                        "refs %d\n", (unsigned long long)eb->start,
96                        eb->len, atomic_read(&eb->refs));
97                 list_del(&eb->leak_list);
98                 kmem_cache_free(extent_buffer_cache, eb);
99         }
100         if (extent_state_cache)
101                 kmem_cache_destroy(extent_state_cache);
102         if (extent_buffer_cache)
103                 kmem_cache_destroy(extent_buffer_cache);
104 }
105
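/*
 * initialize an extent_io_tree: empty state rbtree, empty buffer radix
 * tree, no ops, zero dirty_bytes, with 'mapping' as the backing
 * address_space
 */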
106 void extent_io_tree_init(struct extent_io_tree *tree,
107                          struct address_space *mapping)
108 {
109         tree->state = RB_ROOT;
110         INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
111         tree->ops = NULL;
112         tree->dirty_bytes = 0;
113         spin_lock_init(&tree->lock);
114         spin_lock_init(&tree->buffer_lock);
115         tree->mapping = mapping;
116 }
117
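/*
 * allocate and initialize an extent_state: no bits set, no tree, a
 * single reference and an initialized wait queue.  Returns NULL if the
 * slab allocation fails.
 */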
118 static struct extent_state *alloc_extent_state(gfp_t mask)
119 {
120         struct extent_state *state;
121 #if LEAK_DEBUG
122         unsigned long flags;
123 #endif
124
125         state = kmem_cache_alloc(extent_state_cache, mask);
126         if (!state)
127                 return state;
128         state->state = 0;
129         state->private = 0;
130         state->tree = NULL;
131 #if LEAK_DEBUG
132         spin_lock_irqsave(&leak_lock, flags);
133         list_add(&state->leak_list, &states);
134         spin_unlock_irqrestore(&leak_lock, flags);
135 #endif
136         atomic_set(&state->refs, 1);
137         init_waitqueue_head(&state->wq);
138         return state;
139 }
140
141 void free_extent_state(struct extent_state *state)
142 {
143         if (!state)
144                 return;
145         if (atomic_dec_and_test(&state->refs)) {
146 #if LEAK_DEBUG
147                 unsigned long flags;
148 #endif
149                 WARN_ON(state->tree);
150 #if LEAK_DEBUG
151                 spin_lock_irqsave(&leak_lock, flags);
152                 list_del(&state->leak_list);
153                 spin_unlock_irqrestore(&leak_lock, flags);
154 #endif
155                 kmem_cache_free(extent_state_cache, state);
156         }
157 }
158
159 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
160                                    struct rb_node *node)
161 {
162         struct rb_node **p = &root->rb_node;
163         struct rb_node *parent = NULL;
164         struct tree_entry *entry;
165
166         while (*p) {
167                 parent = *p;
168                 entry = rb_entry(parent, struct tree_entry, rb_node);
169
170                 if (offset < entry->start)
171                         p = &(*p)->rb_left;
172                 else if (offset > entry->end)
173                         p = &(*p)->rb_right;
174                 else
175                         return parent;
176         }
177
178         entry = rb_entry(node, struct tree_entry, rb_node);
179         rb_link_node(node, parent, p);
180         rb_insert_color(node, root);
181         return NULL;
182 }
183
184 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
185                                      struct rb_node **prev_ret,
186                                      struct rb_node **next_ret)
187 {
188         struct rb_root *root = &tree->state;
189         struct rb_node *n = root->rb_node;
190         struct rb_node *prev = NULL;
191         struct rb_node *orig_prev = NULL;
192         struct tree_entry *entry;
193         struct tree_entry *prev_entry = NULL;
194
195         while (n) {
196                 entry = rb_entry(n, struct tree_entry, rb_node);
197                 prev = n;
198                 prev_entry = entry;
199
200                 if (offset < entry->start)
201                         n = n->rb_left;
202                 else if (offset > entry->end)
203                         n = n->rb_right;
204                 else
205                         return n;
206         }
207
208         if (prev_ret) {
209                 orig_prev = prev;
210                 while (prev && offset > prev_entry->end) {
211                         prev = rb_next(prev);
212                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
213                 }
214                 *prev_ret = prev;
215                 prev = orig_prev;
216         }
217
218         if (next_ret) {
219                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
220                 while (prev && offset < prev_entry->start) {
221                         prev = rb_prev(prev);
222                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
223                 }
224                 *next_ret = prev;
225         }
226         return NULL;
227 }
228
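/*
 * search for the extent_state covering 'offset'.  If no state contains
 * the offset, the first state that starts after it is returned instead,
 * or NULL if we ran off the end of the tree.
 */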
229 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
230                                           u64 offset)
231 {
232         struct rb_node *prev = NULL;
233         struct rb_node *ret;
234
235         ret = __etree_search(tree, offset, &prev, NULL);
236         if (!ret)
237                 return prev;
238         return ret;
239 }
240
241 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
242                      struct extent_state *other)
243 {
244         if (tree->ops && tree->ops->merge_extent_hook)
245                 tree->ops->merge_extent_hook(tree->mapping->host, new,
246                                              other);
247 }
248
249 /*
250  * utility function to look for merge candidates inside a given range.
251  * Any extents with matching state are merged together into a single
252  * extent in the tree.  Extents with EXTENT_IO in their state field
253  * are not merged because the end_io handlers need to be able to do
254  * operations on them without sleeping (or doing allocations/splits).
255  *
256  * This should be called with the tree lock held.
257  */
258 static void merge_state(struct extent_io_tree *tree,
259                         struct extent_state *state)
260 {
261         struct extent_state *other;
262         struct rb_node *other_node;
263
264         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
265                 return;
266
267         other_node = rb_prev(&state->rb_node);
268         if (other_node) {
269                 other = rb_entry(other_node, struct extent_state, rb_node);
270                 if (other->end == state->start - 1 &&
271                     other->state == state->state) {
272                         merge_cb(tree, state, other);
273                         state->start = other->start;
274                         other->tree = NULL;
275                         rb_erase(&other->rb_node, &tree->state);
276                         free_extent_state(other);
277                 }
278         }
279         other_node = rb_next(&state->rb_node);
280         if (other_node) {
281                 other = rb_entry(other_node, struct extent_state, rb_node);
282                 if (other->start == state->end + 1 &&
283                     other->state == state->state) {
284                         merge_cb(tree, state, other);
285                         state->end = other->end;
286                         other->tree = NULL;
287                         rb_erase(&other->rb_node, &tree->state);
288                         free_extent_state(other);
289                 }
290         }
291 }
292
293 static void set_state_cb(struct extent_io_tree *tree,
294                          struct extent_state *state, int *bits)
295 {
296         if (tree->ops && tree->ops->set_bit_hook)
297                 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
298 }
299
300 static void clear_state_cb(struct extent_io_tree *tree,
301                            struct extent_state *state, int *bits)
302 {
303         if (tree->ops && tree->ops->clear_bit_hook)
304                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
305 }
306
307 static void set_state_bits(struct extent_io_tree *tree,
308                            struct extent_state *state, int *bits);
309
310 /*
311  * insert an extent_state struct into the tree.  'bits' are set on the
312  * struct before it is inserted.
313  *
314  * This may return -EEXIST if the extent is already there, in which case the
315  * state struct must be freed by the caller.
316  *
317  * The tree lock is not taken internally.  This is a utility function and
318  * probably isn't what you want to call (see set/clear_extent_bit).
319  */
320 static int insert_state(struct extent_io_tree *tree,
321                         struct extent_state *state, u64 start, u64 end,
322                         int *bits)
323 {
324         struct rb_node *node;
325
326         if (end < start) {
327                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
328                        (unsigned long long)end,
329                        (unsigned long long)start);
330                 WARN_ON(1);
331         }
332         state->start = start;
333         state->end = end;
334
335         set_state_bits(tree, state, bits);
336
337         node = tree_insert(&tree->state, end, &state->rb_node);
338         if (node) {
339                 struct extent_state *found;
340                 found = rb_entry(node, struct extent_state, rb_node);
341                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
342                        "%llu %llu\n", (unsigned long long)found->start,
343                        (unsigned long long)found->end,
344                        (unsigned long long)start, (unsigned long long)end);
345                 return -EEXIST;
346         }
347         state->tree = tree;
348         merge_state(tree, state);
349         return 0;
350 }
351
352 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
353                      u64 split)
354 {
355         if (tree->ops && tree->ops->split_extent_hook)
356                 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
357 }
358
359 /*
360  * split a given extent state struct in two, inserting the preallocated
361  * struct 'prealloc' as the newly created second half.  'split' indicates an
362  * offset inside 'orig' where it should be split.
363  *
364  * Before calling, the tree has 'orig' at [orig->start, orig->end].
365  * After calling, there are two extent state structs in the tree:
366  *
367  * prealloc: [orig->start, split - 1]
368  * orig:     [split, orig->end]
369  *
370  * The tree locks are not taken by this function. They need to be held
371  * by the caller.
372  */
373 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
374                        struct extent_state *prealloc, u64 split)
375 {
376         struct rb_node *node;
377
378         split_cb(tree, orig, split);
379
380         prealloc->start = orig->start;
381         prealloc->end = split - 1;
382         prealloc->state = orig->state;
383         orig->start = split;
384
385         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
386         if (node) {
387                 free_extent_state(prealloc);
388                 return -EEXIST;
389         }
390         prealloc->tree = tree;
391         return 0;
392 }
393
394 /*
395  * utility function to clear some bits in an extent state struct.
396  * it will optionally wake up anyone waiting on this state (wake == 1), or
397  * forcibly remove the state from the tree (delete == 1).
398  *
399  * If no bits are set on the state struct after clearing things, the
400  * struct is freed and removed from the tree
401  */
402 static int clear_state_bit(struct extent_io_tree *tree,
403                             struct extent_state *state,
404                             int *bits, int wake)
405 {
406         int bits_to_clear = *bits & ~EXTENT_CTLBITS;
407         int ret = state->state & bits_to_clear;
408
409         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
410                 u64 range = state->end - state->start + 1;
411                 WARN_ON(range > tree->dirty_bytes);
412                 tree->dirty_bytes -= range;
413         }
414         clear_state_cb(tree, state, bits);
415         state->state &= ~bits_to_clear;
416         if (wake)
417                 wake_up(&state->wq);
418         if (state->state == 0) {
419                 if (state->tree) {
420                         rb_erase(&state->rb_node, &tree->state);
421                         state->tree = NULL;
422                         free_extent_state(state);
423                 } else {
424                         WARN_ON(1);
425                 }
426         } else {
427                 merge_state(tree, state);
428         }
429         return ret;
430 }
431
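/*
 * make sure we have an extent_state to work with: if the caller did not
 * preallocate one, fall back to a GFP_ATOMIC allocation here
 */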
432 static struct extent_state *
433 alloc_extent_state_atomic(struct extent_state *prealloc)
434 {
435         if (!prealloc)
436                 prealloc = alloc_extent_state(GFP_ATOMIC);
437
438         return prealloc;
439 }
440
441 /*
442  * clear some bits on a range in the tree.  This may require splitting
443  * or inserting elements in the tree, so the gfp mask is used to
444  * indicate what kind of allocations and sleeping are allowed.
445  *
446  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
447  * the given range from the tree regardless of state (i.e. for truncate).
448  *
449  * the range [start, end] is inclusive.
450  *
451  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
452  * bits were already set, or zero if none of the bits were already set.
453  */
454 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
455                      int bits, int wake, int delete,
456                      struct extent_state **cached_state,
457                      gfp_t mask)
458 {
459         struct extent_state *state;
460         struct extent_state *cached;
461         struct extent_state *prealloc = NULL;
462         struct rb_node *next_node;
463         struct rb_node *node;
464         u64 last_end;
465         int err;
466         int set = 0;
467         int clear = 0;
468
469         if (delete)
470                 bits |= ~EXTENT_CTLBITS;
471         bits |= EXTENT_FIRST_DELALLOC;
472
473         if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
474                 clear = 1;
475 again:
476         if (!prealloc && (mask & __GFP_WAIT)) {
477                 prealloc = alloc_extent_state(mask);
478                 if (!prealloc)
479                         return -ENOMEM;
480         }
481
482         spin_lock(&tree->lock);
483         if (cached_state) {
484                 cached = *cached_state;
485
486                 if (clear) {
487                         *cached_state = NULL;
488                         cached_state = NULL;
489                 }
490
491                 if (cached && cached->tree && cached->start <= start &&
492                     cached->end > start) {
493                         if (clear)
494                                 atomic_dec(&cached->refs);
495                         state = cached;
496                         goto hit_next;
497                 }
498                 if (clear)
499                         free_extent_state(cached);
500         }
501         /*
502          * this search will find the extents that end after
503          * our range starts
504          */
505         node = tree_search(tree, start);
506         if (!node)
507                 goto out;
508         state = rb_entry(node, struct extent_state, rb_node);
509 hit_next:
510         if (state->start > end)
511                 goto out;
512         WARN_ON(state->end < start);
513         last_end = state->end;
514
515         /*
516          *     | ---- desired range ---- |
517          *  | state | or
518          *  | ------------- state -------------- |
519          *
520          * We need to split the extent we found, and may flip
521          * bits on second half.
522          *
523          * If the extent we found extends past our range, we
524          * just split and search again.  It'll get split again
525          * the next time though.
526          *
527          * If the extent we found is inside our range, we clear
528          * the desired bit on it.
529          */
530
531         if (state->start < start) {
532                 prealloc = alloc_extent_state_atomic(prealloc);
533                 BUG_ON(!prealloc);
534                 err = split_state(tree, state, prealloc, start);
535                 BUG_ON(err == -EEXIST);
536                 prealloc = NULL;
537                 if (err)
538                         goto out;
539                 if (state->end <= end) {
540                         set |= clear_state_bit(tree, state, &bits, wake);
541                         if (last_end == (u64)-1)
542                                 goto out;
543                         start = last_end + 1;
544                 }
545                 goto search_again;
546         }
547         /*
548          * | ---- desired range ---- |
549          *                        | state |
550          * We need to split the extent, and clear the bit
551          * on the first half
552          */
553         if (state->start <= end && state->end > end) {
554                 prealloc = alloc_extent_state_atomic(prealloc);
555                 BUG_ON(!prealloc);
556                 err = split_state(tree, state, prealloc, end + 1);
557                 BUG_ON(err == -EEXIST);
558                 if (wake)
559                         wake_up(&state->wq);
560
561                 set |= clear_state_bit(tree, prealloc, &bits, wake);
562
563                 prealloc = NULL;
564                 goto out;
565         }
566
567         if (state->end < end && prealloc && !need_resched())
568                 next_node = rb_next(&state->rb_node);
569         else
570                 next_node = NULL;
571
572         set |= clear_state_bit(tree, state, &bits, wake);
573         if (last_end == (u64)-1)
574                 goto out;
575         start = last_end + 1;
576         if (start <= end && next_node) {
577                 state = rb_entry(next_node, struct extent_state,
578                                  rb_node);
579                 if (state->start == start)
580                         goto hit_next;
581         }
582         goto search_again;
583
584 out:
585         spin_unlock(&tree->lock);
586         if (prealloc)
587                 free_extent_state(prealloc);
588
589         return set;
590
591 search_again:
592         if (start > end)
593                 goto out;
594         spin_unlock(&tree->lock);
595         if (mask & __GFP_WAIT)
596                 cond_resched();
597         goto again;
598 }
599
600 static int wait_on_state(struct extent_io_tree *tree,
601                          struct extent_state *state)
602                 __releases(tree->lock)
603                 __acquires(tree->lock)
604 {
605         DEFINE_WAIT(wait);
606         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
607         spin_unlock(&tree->lock);
608         schedule();
609         spin_lock(&tree->lock);
610         finish_wait(&state->wq, &wait);
611         return 0;
612 }
613
614 /*
615  * waits for one or more bits to clear on a range in the state tree.
616  * The range [start, end] is inclusive.
617  * The tree lock is taken by this function
618  */
619 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
620 {
621         struct extent_state *state;
622         struct rb_node *node;
623
624         spin_lock(&tree->lock);
625 again:
626         while (1) {
627                 /*
628                  * this search will find all the extents that end after
629                  * our range starts
630                  */
631                 node = tree_search(tree, start);
632                 if (!node)
633                         break;
634
635                 state = rb_entry(node, struct extent_state, rb_node);
636
637                 if (state->start > end)
638                         goto out;
639
640                 if (state->state & bits) {
641                         start = state->start;
642                         atomic_inc(&state->refs);
643                         wait_on_state(tree, state);
644                         free_extent_state(state);
645                         goto again;
646                 }
647                 start = state->end + 1;
648
649                 if (start > end)
650                         break;
651
652                 cond_resched_lock(&tree->lock);
653         }
654 out:
655         spin_unlock(&tree->lock);
656         return 0;
657 }
658
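/*
 * set bits on a single extent_state: call the set_bit hook, account any
 * newly dirtied bytes in tree->dirty_bytes, and OR the non-control bits
 * into state->state
 */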
659 static void set_state_bits(struct extent_io_tree *tree,
660                            struct extent_state *state,
661                            int *bits)
662 {
663         int bits_to_set = *bits & ~EXTENT_CTLBITS;
664
665         set_state_cb(tree, state, bits);
666         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
667                 u64 range = state->end - state->start + 1;
668                 tree->dirty_bytes += range;
669         }
670         state->state |= bits_to_set;
671 }
672
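/*
 * cache a reference to 'state' in *cached_ptr so a later operation on
 * the same range can skip the rbtree search.  Only states carrying io
 * or boundary bits are worth caching.
 */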
673 static void cache_state(struct extent_state *state,
674                         struct extent_state **cached_ptr)
675 {
676         if (cached_ptr && !(*cached_ptr)) {
677                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
678                         *cached_ptr = state;
679                         atomic_inc(&state->refs);
680                 }
681         }
682 }
683
684 static void uncache_state(struct extent_state **cached_ptr)
685 {
686         if (cached_ptr && (*cached_ptr)) {
687                 struct extent_state *state = *cached_ptr;
688                 *cached_ptr = NULL;
689                 free_extent_state(state);
690         }
691 }
692
693 /*
694  * set some bits on a range in the tree.  This may require allocations or
695  * sleeping, so the gfp mask is used to indicate what is allowed.
696  *
697  * If any of the exclusive bits are set, this will fail with -EEXIST if some
698  * part of the range already has the desired bits set.  The start of the
699  * existing range is returned in failed_start in this case.
700  *
701  * [start, end] is inclusive.  This takes the tree lock.
702  */
703
704 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
705                    int bits, int exclusive_bits, u64 *failed_start,
706                    struct extent_state **cached_state, gfp_t mask)
707 {
708         struct extent_state *state;
709         struct extent_state *prealloc = NULL;
710         struct rb_node *node;
711         int err = 0;
712         u64 last_start;
713         u64 last_end;
714
715         bits |= EXTENT_FIRST_DELALLOC;
716 again:
717         if (!prealloc && (mask & __GFP_WAIT)) {
718                 prealloc = alloc_extent_state(mask);
719                 BUG_ON(!prealloc);
720         }
721
722         spin_lock(&tree->lock);
723         if (cached_state && *cached_state) {
724                 state = *cached_state;
725                 if (state->start <= start && state->end > start &&
726                     state->tree) {
727                         node = &state->rb_node;
728                         goto hit_next;
729                 }
730         }
731         /*
732          * this search will find all the extents that end after
733          * our range starts.
734          */
735         node = tree_search(tree, start);
736         if (!node) {
737                 prealloc = alloc_extent_state_atomic(prealloc);
738                 BUG_ON(!prealloc);
739                 err = insert_state(tree, prealloc, start, end, &bits);
740                 prealloc = NULL;
741                 BUG_ON(err == -EEXIST);
742                 goto out;
743         }
744         state = rb_entry(node, struct extent_state, rb_node);
745 hit_next:
746         last_start = state->start;
747         last_end = state->end;
748
749         /*
750          * | ---- desired range ---- |
751          * | state |
752          *
753          * Just lock what we found and keep going
754          */
755         if (state->start == start && state->end <= end) {
756                 struct rb_node *next_node;
757                 if (state->state & exclusive_bits) {
758                         *failed_start = state->start;
759                         err = -EEXIST;
760                         goto out;
761                 }
762
763                 set_state_bits(tree, state, &bits);
764
765                 cache_state(state, cached_state);
766                 merge_state(tree, state);
767                 if (last_end == (u64)-1)
768                         goto out;
769
770                 start = last_end + 1;
771                 next_node = rb_next(&state->rb_node);
772                 if (next_node && start < end && prealloc && !need_resched()) {
773                         state = rb_entry(next_node, struct extent_state,
774                                          rb_node);
775                         if (state->start == start)
776                                 goto hit_next;
777                 }
778                 goto search_again;
779         }
780
781         /*
782          *     | ---- desired range ---- |
783          * | state |
784          *   or
785          * | ------------- state -------------- |
786          *
787          * We need to split the extent we found, and may flip bits on
788          * second half.
789          *
790          * If the extent we found extends past our
791          * range, we just split and search again.  It'll get split
792          * again the next time though.
793          *
794          * If the extent we found is inside our range, we set the
795          * desired bit on it.
796          */
797         if (state->start < start) {
798                 if (state->state & exclusive_bits) {
799                         *failed_start = start;
800                         err = -EEXIST;
801                         goto out;
802                 }
803
804                 prealloc = alloc_extent_state_atomic(prealloc);
805                 BUG_ON(!prealloc);
806                 err = split_state(tree, state, prealloc, start);
807                 BUG_ON(err == -EEXIST);
808                 prealloc = NULL;
809                 if (err)
810                         goto out;
811                 if (state->end <= end) {
812                         set_state_bits(tree, state, &bits);
813                         cache_state(state, cached_state);
814                         merge_state(tree, state);
815                         if (last_end == (u64)-1)
816                                 goto out;
817                         start = last_end + 1;
818                 }
819                 goto search_again;
820         }
821         /*
822          * | ---- desired range ---- |
823          *     | state | or               | state |
824          *
825          * There's a hole, we need to insert something in it and
826          * ignore the extent we found.
827          */
828         if (state->start > start) {
829                 u64 this_end;
830                 if (end < last_start)
831                         this_end = end;
832                 else
833                         this_end = last_start - 1;
834
835                 prealloc = alloc_extent_state_atomic(prealloc);
836                 BUG_ON(!prealloc);
837
838                 /*
839                  * Avoid freeing 'prealloc' if it can be merged with
840                  * the later extent.
841                  */
842                 err = insert_state(tree, prealloc, start, this_end,
843                                    &bits);
844                 BUG_ON(err == -EEXIST);
845                 if (err) {
846                         free_extent_state(prealloc);
847                         prealloc = NULL;
848                         goto out;
849                 }
850                 cache_state(prealloc, cached_state);
851                 prealloc = NULL;
852                 start = this_end + 1;
853                 goto search_again;
854         }
855         /*
856          * | ---- desired range ---- |
857          *                        | state |
858          * We need to split the extent, and set the bit
859          * on the first half
860          */
861         if (state->start <= end && state->end > end) {
862                 if (state->state & exclusive_bits) {
863                         *failed_start = start;
864                         err = -EEXIST;
865                         goto out;
866                 }
867
868                 prealloc = alloc_extent_state_atomic(prealloc);
869                 BUG_ON(!prealloc);
870                 err = split_state(tree, state, prealloc, end + 1);
871                 BUG_ON(err == -EEXIST);
872
873                 set_state_bits(tree, prealloc, &bits);
874                 cache_state(prealloc, cached_state);
875                 merge_state(tree, prealloc);
876                 prealloc = NULL;
877                 goto out;
878         }
879
880         goto search_again;
881
882 out:
883         spin_unlock(&tree->lock);
884         if (prealloc)
885                 free_extent_state(prealloc);
886
887         return err;
888
889 search_again:
890         if (start > end)
891                 goto out;
892         spin_unlock(&tree->lock);
893         if (mask & __GFP_WAIT)
894                 cond_resched();
895         goto again;
896 }
897
898 /**
899  * convert_extent_bit - convert all bits in a given range from one bit to another
900  * @tree:       the io tree to search
901  * @start:      the start offset in bytes
902  * @end:        the end offset in bytes (inclusive)
903  * @bits:       the bits to set in this range
904  * @clear_bits: the bits to clear in this range
905  * @mask:       the allocation mask
906  *
907  * This will go through and set bits for the given range.  If any states exist
908  * already in this range they are set with the given bits and cleared of the
909  * clear_bits.  This is only meant to be used by things that are mergeable, i.e.
910  * converting from, say, DELALLOC to DIRTY.  This is not meant to be used with
911  * boundary bits like EXTENT_LOCKED.
912  */
913 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914                        int bits, int clear_bits, gfp_t mask)
915 {
916         struct extent_state *state;
917         struct extent_state *prealloc = NULL;
918         struct rb_node *node;
919         int err = 0;
920         u64 last_start;
921         u64 last_end;
922
923 again:
924         if (!prealloc && (mask & __GFP_WAIT)) {
925                 prealloc = alloc_extent_state(mask);
926                 if (!prealloc)
927                         return -ENOMEM;
928         }
929
930         spin_lock(&tree->lock);
931         /*
932          * this search will find all the extents that end after
933          * our range starts.
934          */
935         node = tree_search(tree, start);
936         if (!node) {
937                 prealloc = alloc_extent_state_atomic(prealloc);
938                 if (!prealloc) {
939                         err = -ENOMEM;
940                         goto out;
941                 }
942                 err = insert_state(tree, prealloc, start, end, &bits);
943                 prealloc = NULL;
944                 BUG_ON(err == -EEXIST);
945                 goto out;
946         }
947         state = rb_entry(node, struct extent_state, rb_node);
948 hit_next:
949         last_start = state->start;
950         last_end = state->end;
951
952         /*
953          * | ---- desired range ---- |
954          * | state |
955          *
956          * Just lock what we found and keep going
957          */
958         if (state->start == start && state->end <= end) {
959                 struct rb_node *next_node;
960
961                 set_state_bits(tree, state, &bits);
962                 clear_state_bit(tree, state, &clear_bits, 0);
963
964                 merge_state(tree, state);
965                 if (last_end == (u64)-1)
966                         goto out;
967
968                 start = last_end + 1;
969                 next_node = rb_next(&state->rb_node);
970                 if (next_node && start < end && prealloc && !need_resched()) {
971                         state = rb_entry(next_node, struct extent_state,
972                                          rb_node);
973                         if (state->start == start)
974                                 goto hit_next;
975                 }
976                 goto search_again;
977         }
978
979         /*
980          *     | ---- desired range ---- |
981          * | state |
982          *   or
983          * | ------------- state -------------- |
984          *
985          * We need to split the extent we found, and may flip bits on
986          * second half.
987          *
988          * If the extent we found extends past our
989          * range, we just split and search again.  It'll get split
990          * again the next time though.
991          *
992          * If the extent we found is inside our range, we set the
993          * desired bit on it.
994          */
995         if (state->start < start) {
996                 prealloc = alloc_extent_state_atomic(prealloc);
997                 if (!prealloc) {
998                         err = -ENOMEM;
999                         goto out;
1000                 }
1001                 err = split_state(tree, state, prealloc, start);
1002                 BUG_ON(err == -EEXIST);
1003                 prealloc = NULL;
1004                 if (err)
1005                         goto out;
1006                 if (state->end <= end) {
1007                         set_state_bits(tree, state, &bits);
1008                         clear_state_bit(tree, state, &clear_bits, 0);
1009                         merge_state(tree, state);
1010                         if (last_end == (u64)-1)
1011                                 goto out;
1012                         start = last_end + 1;
1013                 }
1014                 goto search_again;
1015         }
1016         /*
1017          * | ---- desired range ---- |
1018          *     | state | or               | state |
1019          *
1020          * There's a hole, we need to insert something in it and
1021          * ignore the extent we found.
1022          */
1023         if (state->start > start) {
1024                 u64 this_end;
1025                 if (end < last_start)
1026                         this_end = end;
1027                 else
1028                         this_end = last_start - 1;
1029
1030                 prealloc = alloc_extent_state_atomic(prealloc);
1031                 if (!prealloc) {
1032                         err = -ENOMEM;
1033                         goto out;
1034                 }
1035
1036                 /*
1037                  * Avoid freeing 'prealloc' if it can be merged with
1038                  * the later extent.
1039                  */
1040                 err = insert_state(tree, prealloc, start, this_end,
1041                                    &bits);
1042                 BUG_ON(err == -EEXIST);
1043                 if (err) {
1044                         free_extent_state(prealloc);
1045                         prealloc = NULL;
1046                         goto out;
1047                 }
1048                 prealloc = NULL;
1049                 start = this_end + 1;
1050                 goto search_again;
1051         }
1052         /*
1053          * | ---- desired range ---- |
1054          *                        | state |
1055          * We need to split the extent, and set the bit
1056          * on the first half
1057          */
1058         if (state->start <= end && state->end > end) {
1059                 prealloc = alloc_extent_state_atomic(prealloc);
1060                 if (!prealloc) {
1061                         err = -ENOMEM;
1062                         goto out;
1063                 }
1064
1065                 err = split_state(tree, state, prealloc, end + 1);
1066                 BUG_ON(err == -EEXIST);
1067
1068                 set_state_bits(tree, prealloc, &bits);
1069                 clear_state_bit(tree, prealloc, &clear_bits, 0);
1070
1071                 merge_state(tree, prealloc);
1072                 prealloc = NULL;
1073                 goto out;
1074         }
1075
1076         goto search_again;
1077
1078 out:
1079         spin_unlock(&tree->lock);
1080         if (prealloc)
1081                 free_extent_state(prealloc);
1082
1083         return err;
1084
1085 search_again:
1086         if (start > end)
1087                 goto out;
1088         spin_unlock(&tree->lock);
1089         if (mask & __GFP_WAIT)
1090                 cond_resched();
1091         goto again;
1092 }
1093
1094 /* wrappers around set/clear extent bit */
1095 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1096                      gfp_t mask)
1097 {
1098         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
1099                               NULL, mask);
1100 }
1101
1102 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1103                     int bits, gfp_t mask)
1104 {
1105         return set_extent_bit(tree, start, end, bits, 0, NULL,
1106                               NULL, mask);
1107 }
1108
1109 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1110                       int bits, gfp_t mask)
1111 {
1112         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1113 }
1114
1115 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1116                         struct extent_state **cached_state, gfp_t mask)
1117 {
1118         return set_extent_bit(tree, start, end,
1119                               EXTENT_DELALLOC | EXTENT_UPTODATE,
1120                               0, NULL, cached_state, mask);
1121 }
1122
1123 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1124                        gfp_t mask)
1125 {
1126         return clear_extent_bit(tree, start, end,
1127                                 EXTENT_DIRTY | EXTENT_DELALLOC |
1128                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1129 }
1130
1131 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1132                      gfp_t mask)
1133 {
1134         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
1135                               NULL, mask);
1136 }
1137
1138 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1139                         struct extent_state **cached_state, gfp_t mask)
1140 {
1141         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1142                               NULL, cached_state, mask);
1143 }
1144
1145 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
1146                                  u64 end, struct extent_state **cached_state,
1147                                  gfp_t mask)
1148 {
1149         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1150                                 cached_state, mask);
1151 }
1152
1153 /*
1154  * either insert or lock the state struct between start and end.  Use mask
1155  * to tell us if waiting is desired.
1156  */
1157 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1158                      int bits, struct extent_state **cached_state, gfp_t mask)
1159 {
1160         int err;
1161         u64 failed_start;
1162         while (1) {
1163                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1164                                      EXTENT_LOCKED, &failed_start,
1165                                      cached_state, mask);
1166                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
1167                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1168                         start = failed_start;
1169                 } else {
1170                         break;
1171                 }
1172                 WARN_ON(start > end);
1173         }
1174         return err;
1175 }
1176
1177 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1178 {
1179         return lock_extent_bits(tree, start, end, 0, NULL, mask);
1180 }
1181
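/*
 * try to lock the range without waiting.  Returns 1 if the whole range
 * was locked; on conflict it drops any partial lock it took and
 * returns 0.
 */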
1182 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1183                     gfp_t mask)
1184 {
1185         int err;
1186         u64 failed_start;
1187
1188         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1189                              &failed_start, NULL, mask);
1190         if (err == -EEXIST) {
1191                 if (failed_start > start)
1192                         clear_extent_bit(tree, start, failed_start - 1,
1193                                          EXTENT_LOCKED, 1, 0, NULL, mask);
1194                 return 0;
1195         }
1196         return 1;
1197 }
1198
1199 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1200                          struct extent_state **cached, gfp_t mask)
1201 {
1202         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1203                                 mask);
1204 }
1205
1206 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1207 {
1208         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1209                                 mask);
1210 }
1211
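/*
 * clear the dirty bit, for writeback, on every page cache page covering
 * the byte range [start, end]
 */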
1212 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1213 {
1214         unsigned long index = start >> PAGE_CACHE_SHIFT;
1215         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1216         struct page *page;
1217
1218         while (index <= end_index) {
1219                 page = find_get_page(inode->i_mapping, index);
1220                 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1221                 clear_page_dirty_for_io(page);
1222                 page_cache_release(page);
1223                 index++;
1224         }
1225         return 0;
1226 }
1227
1228 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1229 {
1230         unsigned long index = start >> PAGE_CACHE_SHIFT;
1231         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1232         struct page *page;
1233
1234         while (index <= end_index) {
1235                 page = find_get_page(inode->i_mapping, index);
1236                 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1237                 account_page_redirty(page);
1238                 __set_page_dirty_nobuffers(page);
1239                 page_cache_release(page);
1240                 index++;
1241         }
1242         return 0;
1243 }
1244
1245 /*
1246  * helper function to mark every page covering [start, end] as writeback
1247  */
1248 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1249 {
1250         unsigned long index = start >> PAGE_CACHE_SHIFT;
1251         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1252         struct page *page;
1253
1254         while (index <= end_index) {
1255                 page = find_get_page(tree->mapping, index);
1256                 BUG_ON(!page);
1257                 set_page_writeback(page);
1258                 page_cache_release(page);
1259                 index++;
1260         }
1261         return 0;
1262 }
1263
1264 /* find the first state struct with 'bits' set after 'start', and
1265  * return it.  tree->lock must be held.  NULL will be returned if
1266  * nothing was found after 'start'
1267  */
1268 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1269                                                  u64 start, int bits)
1270 {
1271         struct rb_node *node;
1272         struct extent_state *state;
1273
1274         /*
1275          * this search will find all the extents that end after
1276          * our range starts.
1277          */
1278         node = tree_search(tree, start);
1279         if (!node)
1280                 goto out;
1281
1282         while (1) {
1283                 state = rb_entry(node, struct extent_state, rb_node);
1284                 if (state->end >= start && (state->state & bits))
1285                         return state;
1286
1287                 node = rb_next(node);
1288                 if (!node)
1289                         break;
1290         }
1291 out:
1292         return NULL;
1293 }
1294
1295 /*
1296  * find the first offset in the io tree with 'bits' set. zero is
1297  * returned if we find something, and *start_ret and *end_ret are
1298  * set to reflect the state struct that was found.
1299  *
1300  * If nothing was found, 1 is returned, < 0 on error
1301  */
1302 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1303                           u64 *start_ret, u64 *end_ret, int bits)
1304 {
1305         struct extent_state *state;
1306         int ret = 1;
1307
1308         spin_lock(&tree->lock);
1309         state = find_first_extent_bit_state(tree, start, bits);
1310         if (state) {
1311                 *start_ret = state->start;
1312                 *end_ret = state->end;
1313                 ret = 0;
1314         }
1315         spin_unlock(&tree->lock);
1316         return ret;
1317 }
1318
1319 /*
1320  * find a contiguous range of bytes in the file marked as delalloc, not
1321  * more than 'max_bytes'.  start and end are used to return the range.
1322  *
1323  * 1 is returned if we find something, 0 if nothing was in the tree
1324  */
1325 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1326                                         u64 *start, u64 *end, u64 max_bytes,
1327                                         struct extent_state **cached_state)
1328 {
1329         struct rb_node *node;
1330         struct extent_state *state;
1331         u64 cur_start = *start;
1332         u64 found = 0;
1333         u64 total_bytes = 0;
1334
1335         spin_lock(&tree->lock);
1336
1337         /*
1338          * this search will find all the extents that end after
1339          * our range starts.
1340          */
1341         node = tree_search(tree, cur_start);
1342         if (!node) {
1343                 if (!found)
1344                         *end = (u64)-1;
1345                 goto out;
1346         }
1347
1348         while (1) {
1349                 state = rb_entry(node, struct extent_state, rb_node);
1350                 if (found && (state->start != cur_start ||
1351                               (state->state & EXTENT_BOUNDARY))) {
1352                         goto out;
1353                 }
1354                 if (!(state->state & EXTENT_DELALLOC)) {
1355                         if (!found)
1356                                 *end = state->end;
1357                         goto out;
1358                 }
1359                 if (!found) {
1360                         *start = state->start;
1361                         *cached_state = state;
1362                         atomic_inc(&state->refs);
1363                 }
1364                 found++;
1365                 *end = state->end;
1366                 cur_start = state->end + 1;
1367                 node = rb_next(node);
1368                 if (!node)
1369                         break;
1370                 total_bytes += state->end - state->start + 1;
1371                 if (total_bytes >= max_bytes)
1372                         break;
1373         }
1374 out:
1375         spin_unlock(&tree->lock);
1376         return found;
1377 }
1378
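/*
 * unlock every page covering [start, end] except locked_page, which is
 * left locked for the caller
 */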
1379 static noinline int __unlock_for_delalloc(struct inode *inode,
1380                                           struct page *locked_page,
1381                                           u64 start, u64 end)
1382 {
1383         int ret;
1384         struct page *pages[16];
1385         unsigned long index = start >> PAGE_CACHE_SHIFT;
1386         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1387         unsigned long nr_pages = end_index - index + 1;
1388         int i;
1389
1390         if (index == locked_page->index && end_index == index)
1391                 return 0;
1392
1393         while (nr_pages > 0) {
1394                 ret = find_get_pages_contig(inode->i_mapping, index,
1395                                      min_t(unsigned long, nr_pages,
1396                                      ARRAY_SIZE(pages)), pages);
1397                 for (i = 0; i < ret; i++) {
1398                         if (pages[i] != locked_page)
1399                                 unlock_page(pages[i]);
1400                         page_cache_release(pages[i]);
1401                 }
1402                 nr_pages -= ret;
1403                 index += ret;
1404                 cond_resched();
1405         }
1406         return 0;
1407 }
1408
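/*
 * lock every page in [delalloc_start, delalloc_end] except locked_page,
 * which the caller already holds.  Returns -EAGAIN, after unlocking any
 * pages we did lock, if a page has gone away, is no longer dirty, or no
 * longer belongs to this inode.
 */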
1409 static noinline int lock_delalloc_pages(struct inode *inode,
1410                                         struct page *locked_page,
1411                                         u64 delalloc_start,
1412                                         u64 delalloc_end)
1413 {
1414         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1415         unsigned long start_index = index;
1416         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1417         unsigned long pages_locked = 0;
1418         struct page *pages[16];
1419         unsigned long nrpages;
1420         int ret;
1421         int i;
1422
1423         /* the caller is responsible for locking the start index */
1424         if (index == locked_page->index && index == end_index)
1425                 return 0;
1426
1427         /* skip the page at the start index */
1428         nrpages = end_index - index + 1;
1429         while (nrpages > 0) {
1430                 ret = find_get_pages_contig(inode->i_mapping, index,
1431                                      min_t(unsigned long,
1432                                      nrpages, ARRAY_SIZE(pages)), pages);
1433                 if (ret == 0) {
1434                         ret = -EAGAIN;
1435                         goto done;
1436                 }
1437                 /* now we have an array of pages, lock them all */
1438                 for (i = 0; i < ret; i++) {
1439                         /*
1440                          * the caller is taking responsibility for
1441                          * locked_page
1442                          */
1443                         if (pages[i] != locked_page) {
1444                                 lock_page(pages[i]);
1445                                 if (!PageDirty(pages[i]) ||
1446                                     pages[i]->mapping != inode->i_mapping) {
1447                                         ret = -EAGAIN;
1448                                         unlock_page(pages[i]);
1449                                         page_cache_release(pages[i]);
1450                                         goto done;
1451                                 }
1452                         }
1453                         page_cache_release(pages[i]);
1454                         pages_locked++;
1455                 }
1456                 nrpages -= ret;
1457                 index += ret;
1458                 cond_resched();
1459         }
1460         ret = 0;
1461 done:
1462         if (ret && pages_locked) {
1463                 __unlock_for_delalloc(inode, locked_page,
1464                               delalloc_start,
1465                               ((u64)(start_index + pages_locked - 1)) <<
1466                               PAGE_CACHE_SHIFT);
1467         }
1468         return ret;
1469 }
1470
1471 /*
1472  * find a contiguous range of bytes in the file marked as delalloc, not
1473  * more than 'max_bytes'.  start and end are used to return the range.
1474  *
1475  * 1 is returned if we find something, 0 if nothing was in the tree
1476  */
1477 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1478                                              struct extent_io_tree *tree,
1479                                              struct page *locked_page,
1480                                              u64 *start, u64 *end,
1481                                              u64 max_bytes)
1482 {
1483         u64 delalloc_start;
1484         u64 delalloc_end;
1485         u64 found;
1486         struct extent_state *cached_state = NULL;
1487         int ret;
1488         int loops = 0;
1489
1490 again:
1491         /* step one, find a bunch of delalloc bytes starting at start */
1492         delalloc_start = *start;
1493         delalloc_end = 0;
1494         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1495                                     max_bytes, &cached_state);
1496         if (!found || delalloc_end <= *start) {
1497                 *start = delalloc_start;
1498                 *end = delalloc_end;
1499                 free_extent_state(cached_state);
1500                 return found;
1501         }
1502
1503         /*
1504          * start comes from the offset of locked_page.  We have to lock
1505          * pages in order, so we can't process delalloc bytes before
1506          * locked_page
1507          */
1508         if (delalloc_start < *start)
1509                 delalloc_start = *start;
1510
1511         /*
1512          * make sure to limit the number of pages we try to lock down
1513          * if we're looping.
1514          */
1515         if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1516                 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1517
1518         /* step two, lock all the pages after the page that has start */
1519         ret = lock_delalloc_pages(inode, locked_page,
1520                                   delalloc_start, delalloc_end);
1521         if (ret == -EAGAIN) {
1522                 /* some of the pages are gone, let's avoid looping by
1523                  * shortening the size of the delalloc range we're searching
1524                  */
1525                 free_extent_state(cached_state);
1526                 if (!loops) {
1527                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1528                         max_bytes = PAGE_CACHE_SIZE - offset;
1529                         loops = 1;
1530                         goto again;
1531                 } else {
1532                         found = 0;
1533                         goto out_failed;
1534                 }
1535         }
1536         BUG_ON(ret);
1537
1538         /* step three, lock the state bits for the whole range */
1539         lock_extent_bits(tree, delalloc_start, delalloc_end,
1540                          0, &cached_state, GFP_NOFS);
1541
1542         /* then test to make sure it is all still delalloc */
1543         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1544                              EXTENT_DELALLOC, 1, cached_state);
1545         if (!ret) {
1546                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1547                                      &cached_state, GFP_NOFS);
1548                 __unlock_for_delalloc(inode, locked_page,
1549                               delalloc_start, delalloc_end);
1550                 cond_resched();
1551                 goto again;
1552         }
1553         free_extent_state(cached_state);
1554         *start = delalloc_start;
1555         *end = delalloc_end;
1556 out_failed:
1557         return found;
1558 }
1559
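     /*
      * helper to finish a delalloc range: clear the requested bits in the io
      * tree, then walk the pages in the range and clear dirty, start/end
      * writeback, set PagePrivate2 and/or unlock them according to the 'op'
      * mask.  locked_page is skipped so the caller can handle it.
      */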
1560 int extent_clear_unlock_delalloc(struct inode *inode,
1561                                 struct extent_io_tree *tree,
1562                                 u64 start, u64 end, struct page *locked_page,
1563                                 unsigned long op)
1564 {
1565         int ret;
1566         struct page *pages[16];
1567         unsigned long index = start >> PAGE_CACHE_SHIFT;
1568         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1569         unsigned long nr_pages = end_index - index + 1;
1570         int i;
1571         int clear_bits = 0;
1572
1573         if (op & EXTENT_CLEAR_UNLOCK)
1574                 clear_bits |= EXTENT_LOCKED;
1575         if (op & EXTENT_CLEAR_DIRTY)
1576                 clear_bits |= EXTENT_DIRTY;
1577
1578         if (op & EXTENT_CLEAR_DELALLOC)
1579                 clear_bits |= EXTENT_DELALLOC;
1580
1581         clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1582         if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1583                     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1584                     EXTENT_SET_PRIVATE2)))
1585                 return 0;
1586
1587         while (nr_pages > 0) {
1588                 ret = find_get_pages_contig(inode->i_mapping, index,
1589                                      min_t(unsigned long,
1590                                      nr_pages, ARRAY_SIZE(pages)), pages);
1591                 for (i = 0; i < ret; i++) {
1592
1593                         if (op & EXTENT_SET_PRIVATE2)
1594                                 SetPagePrivate2(pages[i]);
1595
1596                         if (pages[i] == locked_page) {
1597                                 page_cache_release(pages[i]);
1598                                 continue;
1599                         }
1600                         if (op & EXTENT_CLEAR_DIRTY)
1601                                 clear_page_dirty_for_io(pages[i]);
1602                         if (op & EXTENT_SET_WRITEBACK)
1603                                 set_page_writeback(pages[i]);
1604                         if (op & EXTENT_END_WRITEBACK)
1605                                 end_page_writeback(pages[i]);
1606                         if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1607                                 unlock_page(pages[i]);
1608                         page_cache_release(pages[i]);
1609                 }
1610                 nr_pages -= ret;
1611                 index += ret;
1612                 cond_resched();
1613         }
1614         return 0;
1615 }
1616
1617 /*
1618  * count the number of bytes in the tree that have a given bit(s)
1619  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1620  * cached.  The total number found is returned.
1621  */
1622 u64 count_range_bits(struct extent_io_tree *tree,
1623                      u64 *start, u64 search_end, u64 max_bytes,
1624                      unsigned long bits, int contig)
1625 {
1626         struct rb_node *node;
1627         struct extent_state *state;
1628         u64 cur_start = *start;
1629         u64 total_bytes = 0;
1630         u64 last = 0;
1631         int found = 0;
1632
1633         if (search_end <= cur_start) {
1634                 WARN_ON(1);
1635                 return 0;
1636         }
1637
1638         spin_lock(&tree->lock);
1639         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1640                 total_bytes = tree->dirty_bytes;
1641                 goto out;
1642         }
1643         /*
1644          * this search will find all the extents that end after
1645          * our range starts.
1646          */
1647         node = tree_search(tree, cur_start);
1648         if (!node)
1649                 goto out;
1650
1651         while (1) {
1652                 state = rb_entry(node, struct extent_state, rb_node);
1653                 if (state->start > search_end)
1654                         break;
1655                 if (contig && found && state->start > last + 1)
1656                         break;
1657                 if (state->end >= cur_start && (state->state & bits) == bits) {
1658                         total_bytes += min(search_end, state->end) + 1 -
1659                                        max(cur_start, state->start);
1660                         if (total_bytes >= max_bytes)
1661                                 break;
1662                         if (!found) {
1663                                 *start = max(cur_start, state->start);
1664                                 found = 1;
1665                         }
1666                         last = state->end;
1667                 } else if (contig && found) {
1668                         break;
1669                 }
1670                 node = rb_next(node);
1671                 if (!node)
1672                         break;
1673         }
1674 out:
1675         spin_unlock(&tree->lock);
1676         return total_bytes;
1677 }
1678
1679 /*
1680  * set the private field for a given byte offset in the tree.  If there isn't
1681  * an extent_state that starts at that offset, -ENOENT is returned.
1682  */
1683 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1684 {
1685         struct rb_node *node;
1686         struct extent_state *state;
1687         int ret = 0;
1688
1689         spin_lock(&tree->lock);
1690         /*
1691          * this search will find all the extents that end after
1692          * our range starts.
1693          */
1694         node = tree_search(tree, start);
1695         if (!node) {
1696                 ret = -ENOENT;
1697                 goto out;
1698         }
1699         state = rb_entry(node, struct extent_state, rb_node);
1700         if (state->start != start) {
1701                 ret = -ENOENT;
1702                 goto out;
1703         }
1704         state->private = private;
1705 out:
1706         spin_unlock(&tree->lock);
1707         return ret;
1708 }
1709
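     /*
      * read back the private value stored for the extent_state that starts
      * exactly at 'start'; returns -ENOENT if no such state exists.
      */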
1710 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1711 {
1712         struct rb_node *node;
1713         struct extent_state *state;
1714         int ret = 0;
1715
1716         spin_lock(&tree->lock);
1717         /*
1718          * this search will find all the extents that end after
1719          * our range starts.
1720          */
1721         node = tree_search(tree, start);
1722         if (!node) {
1723                 ret = -ENOENT;
1724                 goto out;
1725         }
1726         state = rb_entry(node, struct extent_state, rb_node);
1727         if (state->start != start) {
1728                 ret = -ENOENT;
1729                 goto out;
1730         }
1731         *private = state->private;
1732 out:
1733         spin_unlock(&tree->lock);
1734         return ret;
1735 }
1736
1737 /*
1738  * searches a range in the state tree for a given mask.
1739  * If 'filled' == 1, this returns 1 only if every extent in the range
1740  * has the bits set.  Otherwise, 1 is returned if any bit in the
1741  * range is found set.
1742  */
1743 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1744                    int bits, int filled, struct extent_state *cached)
1745 {
1746         struct extent_state *state = NULL;
1747         struct rb_node *node;
1748         int bitset = 0;
1749
1750         spin_lock(&tree->lock);
1751         if (cached && cached->tree && cached->start <= start &&
1752             cached->end > start)
1753                 node = &cached->rb_node;
1754         else
1755                 node = tree_search(tree, start);
1756         while (node && start <= end) {
1757                 state = rb_entry(node, struct extent_state, rb_node);
1758
1759                 if (filled && state->start > start) {
1760                         bitset = 0;
1761                         break;
1762                 }
1763
1764                 if (state->start > end)
1765                         break;
1766
1767                 if (state->state & bits) {
1768                         bitset = 1;
1769                         if (!filled)
1770                                 break;
1771                 } else if (filled) {
1772                         bitset = 0;
1773                         break;
1774                 }
1775
1776                 if (state->end == (u64)-1)
1777                         break;
1778
1779                 start = state->end + 1;
1780                 if (start > end)
1781                         break;
1782                 node = rb_next(node);
1783                 if (!node) {
1784                         if (filled)
1785                                 bitset = 0;
1786                         break;
1787                 }
1788         }
1789         spin_unlock(&tree->lock);
1790         return bitset;
1791 }
1792
1793 /*
1794  * helper function to set a given page up to date if all the
1795  * extents in the tree for that page are up to date
1796  */
1797 static int check_page_uptodate(struct extent_io_tree *tree,
1798                                struct page *page)
1799 {
1800         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1801         u64 end = start + PAGE_CACHE_SIZE - 1;
1802         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1803                 SetPageUptodate(page);
1804         return 0;
1805 }
1806
1807 /*
1808  * helper function to unlock a page if all the extents in the tree
1809  * for that page are unlocked
1810  */
1811 static int check_page_locked(struct extent_io_tree *tree,
1812                              struct page *page)
1813 {
1814         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1815         u64 end = start + PAGE_CACHE_SIZE - 1;
1816         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1817                 unlock_page(page);
1818         return 0;
1819 }
1820
1821 /*
1822  * helper function to end page writeback.  The extent tree is not
1823  * consulted here; writeback for the page is simply ended.
1824  */
1825 static int check_page_writeback(struct extent_io_tree *tree,
1826                              struct page *page)
1827 {
1828         end_page_writeback(page);
1829         return 0;
1830 }
1831
1832 /*
1833  * When IO fails, either with EIO or csum verification fails, we
1834  * try other mirrors that might have a good copy of the data.  This
1835  * io_failure_record is used to record state as we go through all the
1836  * mirrors.  If another mirror has good data, the page is set up to date
1837  * and things continue.  If a good mirror can't be found, the original
1838  * bio end_io callback is called to indicate things have failed.
1839  */
1840 struct io_failure_record {
1841         struct page *page;
1842         u64 start;
1843         u64 len;
1844         u64 logical;
1845         unsigned long bio_flags;
1846         int this_mirror;
1847         int failed_mirror;
1848         int in_validation;
1849 };
1850
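     /*
      * release an io_failure_record: drop its bits from the failure tree and,
      * if the repair succeeded, clear EXTENT_DAMAGED from the inode's io tree.
      */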
1851 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1852                                 int did_repair)
1853 {
1854         int ret;
1855         int err = 0;
1856         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1857
1858         set_state_private(failure_tree, rec->start, 0);
1859         ret = clear_extent_bits(failure_tree, rec->start,
1860                                 rec->start + rec->len - 1,
1861                                 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1862         if (ret)
1863                 err = ret;
1864
1865         if (did_repair) {
1866                 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1867                                         rec->start + rec->len - 1,
1868                                         EXTENT_DAMAGED, GFP_NOFS);
1869                 if (ret && !err)
1870                         err = ret;
1871         }
1872
1873         kfree(rec);
1874         return err;
1875 }
1876
1877 static void repair_io_failure_callback(struct bio *bio, int err)
1878 {
1879         complete(bio->bi_private);
1880 }
1881
1882 /*
1883  * this bypasses the standard btrfs submit functions deliberately, as
1884  * the standard behavior is to write all copies in a raid setup. here we only
1885  * want to write the one bad copy. so we do the mapping for ourselves and issue
1886  * submit_bio directly.
1887  * to avoid any synchronization issues, wait for the data after writing, which
1888  * actually prevents the read that triggered the error from finishing.
1889  * currently, there can be no more than two copies of every data bit. thus,
1890  * exactly one rewrite is required.
1891  */
1892 int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1893                         u64 length, u64 logical, struct page *page,
1894                         int mirror_num)
1895 {
1896         struct bio *bio;
1897         struct btrfs_device *dev;
1898         DECLARE_COMPLETION_ONSTACK(compl);
1899         u64 map_length = 0;
1900         u64 sector;
1901         struct btrfs_bio *bbio = NULL;
1902         int ret;
1903
1904         BUG_ON(!mirror_num);
1905
1906         bio = bio_alloc(GFP_NOFS, 1);
1907         if (!bio)
1908                 return -EIO;
1909         bio->bi_private = &compl;
1910         bio->bi_end_io = repair_io_failure_callback;
1911         bio->bi_size = 0;
1912         map_length = length;
1913
1914         ret = btrfs_map_block(map_tree, WRITE, logical,
1915                               &map_length, &bbio, mirror_num);
1916         if (ret) {
1917                 bio_put(bio);
1918                 return -EIO;
1919         }
1920         BUG_ON(mirror_num != bbio->mirror_num);
1921         sector = bbio->stripes[mirror_num-1].physical >> 9;
1922         bio->bi_sector = sector;
1923         dev = bbio->stripes[mirror_num-1].dev;
1924         kfree(bbio);
1925         if (!dev || !dev->bdev || !dev->writeable) {
1926                 bio_put(bio);
1927                 return -EIO;
1928         }
1929         bio->bi_bdev = dev->bdev;
1930         bio_add_page(bio, page, length, start-page_offset(page));
1931         submit_bio(WRITE_SYNC, bio);
1932         wait_for_completion(&compl);
1933
1934         if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1935                 /* try to remap that extent elsewhere? */
1936                 bio_put(bio);
1937                 return -EIO;
1938         }
1939
1940         printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1941                         "sector %llu)\n", page->mapping->host->i_ino, start,
1942                         dev->name, sector);
1943
1944         bio_put(bio);
1945         return 0;
1946 }
1947
1948 /*
1949  * each time an IO finishes, we do a fast check in the IO failure tree
1950  * to see if we need to process or clean up an io_failure_record
1951  */
1952 static int clean_io_failure(u64 start, struct page *page)
1953 {
1954         u64 private;
1955         u64 private_failure;
1956         struct io_failure_record *failrec;
1957         struct btrfs_mapping_tree *map_tree;
1958         struct extent_state *state;
1959         int num_copies;
1960         int did_repair = 0;
1961         int ret;
1962         struct inode *inode = page->mapping->host;
1963
1964         private = 0;
1965         ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1966                                 (u64)-1, 1, EXTENT_DIRTY, 0);
1967         if (!ret)
1968                 return 0;
1969
1970         ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1971                                 &private_failure);
1972         if (ret)
1973                 return 0;
1974
1975         failrec = (struct io_failure_record *)(unsigned long) private_failure;
1976         BUG_ON(!failrec->this_mirror);
1977
1978         if (failrec->in_validation) {
1979                 /* there was no real error, just free the record */
1980                 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1981                          failrec->start);
1982                 did_repair = 1;
1983                 goto out;
1984         }
1985
1986         spin_lock(&BTRFS_I(inode)->io_tree.lock);
1987         state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1988                                             failrec->start,
1989                                             EXTENT_LOCKED);
1990         spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1991
1992         if (state && state->start == failrec->start) {
1993                 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1994                 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1995                                                 failrec->len);
1996                 if (num_copies > 1)  {
1997                         ret = repair_io_failure(map_tree, start, failrec->len,
1998                                                 failrec->logical, page,
1999                                                 failrec->failed_mirror);
2000                         did_repair = !ret;
2001                 }
2002         }
2003
2004 out:
2005         if (!ret)
2006                 ret = free_io_failure(inode, failrec, did_repair);
2007
2008         return ret;
2009 }
2010
2011 /*
2012  * this is a generic handler for readpage errors (default
2013  * readpage_io_failed_hook). if other copies exist, read those and write back
2014  * good data to the failed position. does not attempt to remap the failed
2015  * extent elsewhere, hoping the device will be smart enough to do this as
2016  * needed
2017  */
2018
2019 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2020                                 u64 start, u64 end, int failed_mirror,
2021                                 struct extent_state *state)
2022 {
2023         struct io_failure_record *failrec = NULL;
2024         u64 private;
2025         struct extent_map *em;
2026         struct inode *inode = page->mapping->host;
2027         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2028         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2029         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2030         struct bio *bio;
2031         int num_copies;
2032         int ret;
2033         int read_mode;
2034         u64 logical;
2035
2036         BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2037
2038         ret = get_state_private(failure_tree, start, &private);
2039         if (ret) {
2040                 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2041                 if (!failrec)
2042                         return -ENOMEM;
2043                 failrec->start = start;
2044                 failrec->len = end - start + 1;
2045                 failrec->this_mirror = 0;
2046                 failrec->bio_flags = 0;
2047                 failrec->in_validation = 0;
2048
2049                 read_lock(&em_tree->lock);
2050                 em = lookup_extent_mapping(em_tree, start, failrec->len);
2051                 if (!em) {
2052                         read_unlock(&em_tree->lock);
2053                         kfree(failrec);
2054                         return -EIO;
2055                 }
2056
2057                 if (em->start > start || em->start + em->len < start) {
2058                         free_extent_map(em);
2059                         em = NULL;
2060                 }
2061                 read_unlock(&em_tree->lock);
2062
2063                 if (!em || IS_ERR(em)) {
2064                         kfree(failrec);
2065                         return -EIO;
2066                 }
2067                 logical = start - em->start;
2068                 logical = em->block_start + logical;
2069                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2070                         logical = em->block_start;
2071                         failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2072                         extent_set_compress_type(&failrec->bio_flags,
2073                                                  em->compress_type);
2074                 }
2075                 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2076                          "len=%llu\n", logical, start, failrec->len);
2077                 failrec->logical = logical;
2078                 free_extent_map(em);
2079
2080                 /* set the bits in the private failure tree */
2081                 ret = set_extent_bits(failure_tree, start, end,
2082                                         EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2083                 if (ret >= 0)
2084                         ret = set_state_private(failure_tree, start,
2085                                                 (u64)(unsigned long)failrec);
2086                 /* set the bits in the inode's tree */
2087                 if (ret >= 0)
2088                         ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2089                                                 GFP_NOFS);
2090                 if (ret < 0) {
2091                         kfree(failrec);
2092                         return ret;
2093                 }
2094         } else {
2095                 failrec = (struct io_failure_record *)(unsigned long)private;
2096                 pr_debug("bio_readpage_error: (found) logical=%llu, "
2097                          "start=%llu, len=%llu, validation=%d\n",
2098                          failrec->logical, failrec->start, failrec->len,
2099                          failrec->in_validation);
2100                 /*
2101                  * when data can be on disk more than twice, add to failrec here
2102                  * (e.g. with a list for failed_mirror) to make
2103                  * clean_io_failure() clean all those errors at once.
2104                  */
2105         }
2106         num_copies = btrfs_num_copies(
2107                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
2108                               failrec->logical, failrec->len);
2109         if (num_copies == 1) {
2110                 /*
2111                  * we only have a single copy of the data, so don't bother with
2112                  * all the retry and error correction code that follows. no
2113                  * matter what the error is, it is very likely to persist.
2114                  */
2115                 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2116                          "state=%p, num_copies=%d, next_mirror %d, "
2117                          "failed_mirror %d\n", state, num_copies,
2118                          failrec->this_mirror, failed_mirror);
2119                 free_io_failure(inode, failrec, 0);
2120                 return -EIO;
2121         }
2122
2123         if (!state) {
2124                 spin_lock(&tree->lock);
2125                 state = find_first_extent_bit_state(tree, failrec->start,
2126                                                     EXTENT_LOCKED);
2127                 if (state && state->start != failrec->start)
2128                         state = NULL;
2129                 spin_unlock(&tree->lock);
2130         }
2131
2132         /*
2133          * there are two premises:
2134          *      a) deliver good data to the caller
2135          *      b) correct the bad sectors on disk
2136          */
2137         if (failed_bio->bi_vcnt > 1) {
2138                 /*
2139                  * to fulfill b), we need to know the exact failing sectors, as
2140                  * we don't want to rewrite any more than the failed ones. thus,
2141                  * we need separate read requests for the failed bio
2142                  *
2143                  * if the following BUG_ON triggers, our validation request got
2144                  * merged. we need separate requests for our algorithm to work.
2145                  */
2146                 BUG_ON(failrec->in_validation);
2147                 failrec->in_validation = 1;
2148                 failrec->this_mirror = failed_mirror;
2149                 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2150         } else {
2151                 /*
2152                  * we're ready to fulfill a) and b) at the same time. get a
2153                  * good copy of the failed sector and, if we succeed, we have
2154                  * set up everything for repair_io_failure to do the rest for us.
2155                  */
2156                 if (failrec->in_validation) {
2157                         BUG_ON(failrec->this_mirror != failed_mirror);
2158                         failrec->in_validation = 0;
2159                         failrec->this_mirror = 0;
2160                 }
2161                 failrec->failed_mirror = failed_mirror;
2162                 failrec->this_mirror++;
2163                 if (failrec->this_mirror == failed_mirror)
2164                         failrec->this_mirror++;
2165                 read_mode = READ_SYNC;
2166         }
2167
2168         if (!state || failrec->this_mirror > num_copies) {
2169                 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2170                          "next_mirror %d, failed_mirror %d\n", state,
2171                          num_copies, failrec->this_mirror, failed_mirror);
2172                 free_io_failure(inode, failrec, 0);
2173                 return -EIO;
2174         }
2175
2176         bio = bio_alloc(GFP_NOFS, 1);
2177         bio->bi_private = state;
2178         bio->bi_end_io = failed_bio->bi_end_io;
2179         bio->bi_sector = failrec->logical >> 9;
2180         bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2181         bio->bi_size = 0;
2182
2183         bio_add_page(bio, page, failrec->len, start - page_offset(page));
2184
2185         pr_debug("bio_readpage_error: submitting new read[%#x] to "
2186                  "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2187                  failrec->this_mirror, num_copies, failrec->in_validation);
2188
2189         tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2190                                         failrec->bio_flags, 0);
2191         return 0;
2192 }
2193
2194 /* lots and lots of room for performance fixes in the end_bio funcs */
2195
2196 /*
2197  * after a writepage IO is done, we need to:
2198  * clear the uptodate bits on error
2199  * clear the writeback bits in the extent tree for this IO
2200  * end_page_writeback if the page has no more pending IO
2201  *
2202  * Scheduling is not allowed, so the extent state tree is expected
2203  * to have one and only one object corresponding to this IO.
2204  */
2205 static void end_bio_extent_writepage(struct bio *bio, int err)
2206 {
2207         int uptodate = err == 0;
2208         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2209         struct extent_io_tree *tree;
2210         u64 start;
2211         u64 end;
2212         int whole_page;
2213         int ret;
2214
2215         do {
2216                 struct page *page = bvec->bv_page;
2217                 tree = &BTRFS_I(page->mapping->host)->io_tree;
2218
2219                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2220                          bvec->bv_offset;
2221                 end = start + bvec->bv_len - 1;
2222
2223                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2224                         whole_page = 1;
2225                 else
2226                         whole_page = 0;
2227
2228                 if (--bvec >= bio->bi_io_vec)
2229                         prefetchw(&bvec->bv_page->flags);
2230                 if (tree->ops && tree->ops->writepage_end_io_hook) {
2231                         ret = tree->ops->writepage_end_io_hook(page, start,
2232                                                        end, NULL, uptodate);
2233                         if (ret)
2234                                 uptodate = 0;
2235                 }
2236
2237                 if (!uptodate && tree->ops &&
2238                     tree->ops->writepage_io_failed_hook) {
2239                         ret = tree->ops->writepage_io_failed_hook(bio, page,
2240                                                          start, end, NULL);
2241                         if (ret == 0) {
2242                                 uptodate = (err == 0);
2243                                 continue;
2244                         }
2245                 }
2246
2247                 if (!uptodate) {
2248                         clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2249                         ClearPageUptodate(page);
2250                         SetPageError(page);
2251                 }
2252
2253                 if (whole_page)
2254                         end_page_writeback(page);
2255                 else
2256                         check_page_writeback(tree, page);
2257         } while (bvec >= bio->bi_io_vec);
2258
2259         bio_put(bio);
2260 }
2261
2262 /*
2263  * after a readpage IO is done, we need to:
2264  * clear the uptodate bits on error
2265  * set the uptodate bits if things worked
2266  * set the page up to date if all extents in the tree are uptodate
2267  * clear the lock bit in the extent tree
2268  * unlock the page if there are no other extents locked for it
2269  *
2270  * Scheduling is not allowed, so the extent state tree is expected
2271  * to have one and only one object corresponding to this IO.
2272  */
2273 static void end_bio_extent_readpage(struct bio *bio, int err)
2274 {
2275         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2276         struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2277         struct bio_vec *bvec = bio->bi_io_vec;
2278         struct extent_io_tree *tree;
2279         u64 start;
2280         u64 end;
2281         int whole_page;
2282         int ret;
2283
2284         if (err)
2285                 uptodate = 0;
2286
2287         do {
2288                 struct page *page = bvec->bv_page;
2289                 struct extent_state *cached = NULL;
2290                 struct extent_state *state;
2291
2292                 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2293                          "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2294                          (long int)bio->bi_bdev);
2295                 tree = &BTRFS_I(page->mapping->host)->io_tree;
2296
2297                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2298                         bvec->bv_offset;
2299                 end = start + bvec->bv_len - 1;
2300
2301                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2302                         whole_page = 1;
2303                 else
2304                         whole_page = 0;
2305
2306                 if (++bvec <= bvec_end)
2307                         prefetchw(&bvec->bv_page->flags);
2308
2309                 spin_lock(&tree->lock);
2310                 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2311                 if (state && state->start == start) {
2312                         /*
2313                          * take a reference on the state, unlock will drop
2314                          * the ref
2315                          */
2316                         cache_state(state, &cached);
2317                 }
2318                 spin_unlock(&tree->lock);
2319
2320                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2321                         ret = tree->ops->readpage_end_io_hook(page, start, end,
2322                                                               state);
2323                         if (ret)
2324                                 uptodate = 0;
2325                         else
2326                                 clean_io_failure(start, page);
2327                 }
2328                 if (!uptodate) {
2329                         int failed_mirror;
2330                         failed_mirror = (int)(unsigned long)bio->bi_bdev;
2331                         /*
2332                          * The generic bio_readpage_error handles errors the
2333                          * following way: If possible, new read requests are
2334                          * created and submitted and will end up in
2335                          * end_bio_extent_readpage as well (if we're lucky, not
2336                          * in the !uptodate case). In that case it returns 0 and
2337                          * we just go on with the next page in our bio. If it
2338                          * can't handle the error it will return -EIO and we
2339                          * remain responsible for that page.
2340                          */
2341                         ret = bio_readpage_error(bio, page, start, end,
2342                                                         failed_mirror, NULL);
2343                         if (ret == 0) {
2344 error_handled:
2345                                 uptodate =
2346                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
2347                                 if (err)
2348                                         uptodate = 0;
2349                                 uncache_state(&cached);
2350                                 continue;
2351                         }
2352                         if (tree->ops && tree->ops->readpage_io_failed_hook) {
2353                                 ret = tree->ops->readpage_io_failed_hook(
2354                                                         bio, page, start, end,
2355                                                         failed_mirror, state);
2356                                 if (ret == 0)
2357                                         goto error_handled;
2358                         }
2359                 }
2360
2361                 if (uptodate) {
2362                         set_extent_uptodate(tree, start, end, &cached,
2363                                             GFP_ATOMIC);
2364                 }
2365                 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2366
2367                 if (whole_page) {
2368                         if (uptodate) {
2369                                 SetPageUptodate(page);
2370                         } else {
2371                                 ClearPageUptodate(page);
2372                                 SetPageError(page);
2373                         }
2374                         unlock_page(page);
2375                 } else {
2376                         if (uptodate) {
2377                                 check_page_uptodate(tree, page);
2378                         } else {
2379                                 ClearPageUptodate(page);
2380                                 SetPageError(page);
2381                         }
2382                         check_page_locked(tree, page);
2383                 }
2384         } while (bvec <= bvec_end);
2385
2386         bio_put(bio);
2387 }
2388
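     /*
      * allocate a bio for btrfs IO.  If the allocation fails while we are in
      * memory reclaim (PF_MEMALLOC), retry with progressively fewer vecs
      * before giving up.
      */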
2389 struct bio *
2390 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2391                 gfp_t gfp_flags)
2392 {
2393         struct bio *bio;
2394
2395         bio = bio_alloc(gfp_flags, nr_vecs);
2396
2397         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2398                 while (!bio && (nr_vecs /= 2))
2399                         bio = bio_alloc(gfp_flags, nr_vecs);
2400         }
2401
2402         if (bio) {
2403                 bio->bi_size = 0;
2404                 bio->bi_bdev = bdev;
2405                 bio->bi_sector = first_sector;
2406         }
2407         return bio;
2408 }
2409
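     /*
      * hand a filled bio to the lower layers.  If the tree provides a
      * submit_bio_hook the bio is routed through it, otherwise it goes
      * straight to submit_bio.  BIO_EOPNOTSUPP from the device is turned
      * into -EOPNOTSUPP for the caller.
      */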
2410 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2411                           unsigned long bio_flags)
2412 {
2413         int ret = 0;
2414         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2415         struct page *page = bvec->bv_page;
2416         struct extent_io_tree *tree = bio->bi_private;
2417         u64 start;
2418
2419         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2420
2421         bio->bi_private = NULL;
2422
2423         bio_get(bio);
2424
2425         if (tree->ops && tree->ops->submit_bio_hook)
2426                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2427                                            mirror_num, bio_flags, start);
2428         else
2429                 submit_bio(rw, bio);
2430
2431         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2432                 ret = -EOPNOTSUPP;
2433         bio_put(bio);
2434         return ret;
2435 }
2436
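     /*
      * add a page to the bio being built for this tree.  If the page is not
      * contiguous with the current bio, the bio flags change, the merge hook
      * refuses the page or it simply doesn't fit, the old bio is submitted
      * and a new one is started for this page.
      */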
2437 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2438                               struct page *page, sector_t sector,
2439                               size_t size, unsigned long offset,
2440                               struct block_device *bdev,
2441                               struct bio **bio_ret,
2442                               unsigned long max_pages,
2443                               bio_end_io_t end_io_func,
2444                               int mirror_num,
2445                               unsigned long prev_bio_flags,
2446                               unsigned long bio_flags)
2447 {
2448         int ret = 0;
2449         struct bio *bio;
2450         int nr;
2451         int contig = 0;
2452         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2453         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2454         size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2455
2456         if (bio_ret && *bio_ret) {
2457                 bio = *bio_ret;
2458                 if (old_compressed)
2459                         contig = bio->bi_sector == sector;
2460                 else
2461                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
2462                                 sector;
2463
2464                 if (prev_bio_flags != bio_flags || !contig ||
2465                     (tree->ops && tree->ops->merge_bio_hook &&
2466                      tree->ops->merge_bio_hook(page, offset, page_size, bio,
2467                                                bio_flags)) ||
2468                     bio_add_page(bio, page, page_size, offset) < page_size) {
2469                         ret = submit_one_bio(rw, bio, mirror_num,
2470                                              prev_bio_flags);
2471                         bio = NULL;
2472                 } else {
2473                         return 0;
2474                 }
2475         }
2476         if (this_compressed)
2477                 nr = BIO_MAX_PAGES;
2478         else
2479                 nr = bio_get_nr_vecs(bdev);
2480
2481         bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2482         if (!bio)
2483                 return -ENOMEM;
2484
2485         bio_add_page(bio, page, page_size, offset);
2486         bio->bi_end_io = end_io_func;
2487         bio->bi_private = tree;
2488
2489         if (bio_ret)
2490                 *bio_ret = bio;
2491         else
2492                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2493
2494         return ret;
2495 }
2496
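     /*
      * tag a page as managed by the extent IO code: take a reference on it
      * and store the EXTENT_PAGE_PRIVATE marker in page->private.
      */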
2497 void set_page_extent_mapped(struct page *page)
2498 {
2499         if (!PagePrivate(page)) {
2500                 SetPagePrivate(page);
2501                 page_cache_get(page);
2502                 set_page_private(page, EXTENT_PAGE_PRIVATE);
2503         }
2504 }
2505
2506 static void set_page_extent_head(struct page *page, unsigned long len)
2507 {
2508         WARN_ON(!PagePrivate(page));
2509         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
2510 }
2511
2512 /*
2513  * basic readpage implementation.  Locked extent state structs are inserted
2514  * into the tree and removed when the IO is done (by the end_io
2515  * handlers)
2516  */
2517 static int __extent_read_full_page(struct extent_io_tree *tree,
2518                                    struct page *page,
2519                                    get_extent_t *get_extent,
2520                                    struct bio **bio, int mirror_num,
2521                                    unsigned long *bio_flags)
2522 {
2523         struct inode *inode = page->mapping->host;
2524         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2525         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2526         u64 end;
2527         u64 cur = start;
2528         u64 extent_offset;
2529         u64 last_byte = i_size_read(inode);
2530         u64 block_start;
2531         u64 cur_end;
2532         sector_t sector;
2533         struct extent_map *em;
2534         struct block_device *bdev;
2535         struct btrfs_ordered_extent *ordered;
2536         int ret;
2537         int nr = 0;
2538         size_t pg_offset = 0;
2539         size_t iosize;
2540         size_t disk_io_size;
2541         size_t blocksize = inode->i_sb->s_blocksize;
2542         unsigned long this_bio_flag = 0;
2543
2544         set_page_extent_mapped(page);
2545
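             /*
              * a cleancache hit restores the whole page, so the read below
              * can be skipped entirely
              */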
2546         if (!PageUptodate(page)) {
2547                 if (cleancache_get_page(page) == 0) {
2548                         BUG_ON(blocksize != PAGE_SIZE);
2549                         goto out;
2550                 }
2551         }
2552
2553         end = page_end;
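             /*
              * lock the range for the read.  If an ordered extent (a pending
              * write) still covers the start of the page, drop the lock,
              * wait for it to finish and try again.
              */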
2554         while (1) {
2555                 lock_extent(tree, start, end, GFP_NOFS);
2556                 ordered = btrfs_lookup_ordered_extent(inode, start);
2557                 if (!ordered)
2558                         break;
2559                 unlock_extent(tree, start, end, GFP_NOFS);
2560                 btrfs_start_ordered_extent(inode, ordered, 1);
2561                 btrfs_put_ordered_extent(ordered);
2562         }
2563
2564         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2565                 char *userpage;
2566                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2567
2568                 if (zero_offset) {
2569                         iosize = PAGE_CACHE_SIZE - zero_offset;
2570                         userpage = kmap_atomic(page, KM_USER0);
2571                         memset(userpage + zero_offset, 0, iosize);
2572                         flush_dcache_page(page);
2573                         kunmap_atomic(userpage, KM_USER0);
2574                 }
2575         }
2576         while (cur <= end) {
2577                 if (cur >= last_byte) {
2578                         char *userpage;
2579                         struct extent_state *cached = NULL;
2580
2581                         iosize = PAGE_CACHE_SIZE - pg_offset;
2582                         userpage = kmap_atomic(page, KM_USER0);
2583                         memset(userpage + pg_offset, 0, iosize);
2584                         flush_dcache_page(page);
2585                         kunmap_atomic(userpage, KM_USER0);
2586                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2587                                             &cached, GFP_NOFS);
2588                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2589                                              &cached, GFP_NOFS);
2590                         break;
2591                 }
2592                 em = get_extent(inode, page, pg_offset, cur,
2593                                 end - cur + 1, 0);
2594                 if (IS_ERR_OR_NULL(em)) {
2595                         SetPageError(page);
2596                         unlock_extent(tree, cur, end, GFP_NOFS);
2597                         break;
2598                 }
2599                 extent_offset = cur - em->start;
2600                 BUG_ON(extent_map_end(em) <= cur);
2601                 BUG_ON(end < cur);
2602
2603                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2604                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2605                         extent_set_compress_type(&this_bio_flag,
2606                                                  em->compress_type);
2607                 }
2608
2609                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2610                 cur_end = min(extent_map_end(em) - 1, end);
2611                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2612                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2613                         disk_io_size = em->block_len;
2614                         sector = em->block_start >> 9;
2615                 } else {
2616                         sector = (em->block_start + extent_offset) >> 9;
2617                         disk_io_size = iosize;
2618                 }
2619                 bdev = em->bdev;
2620                 block_start = em->block_start;
2621                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2622                         block_start = EXTENT_MAP_HOLE;
2623                 free_extent_map(em);
2624                 em = NULL;
2625
2626                 /* we've found a hole, just zero and go on */
2627                 if (block_start == EXTENT_MAP_HOLE) {
2628                         char *userpage;
2629                         struct extent_state *cached = NULL;
2630
2631                         userpage = kmap_atomic(page, KM_USER0);
2632                         memset(userpage + pg_offset, 0, iosize);
2633                         flush_dcache_page(page);
2634                         kunmap_atomic(userpage, KM_USER0);
2635
2636                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2637                                             &cached, GFP_NOFS);
2638                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2639                                              &cached, GFP_NOFS);
2640                         cur = cur + iosize;
2641                         pg_offset += iosize;
2642                         continue;
2643                 }
2644                 /* the get_extent function already copied into the page */
2645                 if (test_range_bit(tree, cur, cur_end,
2646                                    EXTENT_UPTODATE, 1, NULL)) {
2647                         check_page_uptodate(tree, page);
2648                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2649                         cur = cur + iosize;
2650                         pg_offset += iosize;
2651                         continue;
2652                 }
2653                 /* we have an inline extent but it didn't get marked up
2654                  * to date.  Error out
2655                  */
2656                 if (block_start == EXTENT_MAP_INLINE) {
2657                         SetPageError(page);
2658                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2659                         cur = cur + iosize;
2660                         pg_offset += iosize;
2661                         continue;
2662                 }
2663
2664                 ret = 0;
2665                 if (tree->ops && tree->ops->readpage_io_hook) {
2666                         ret = tree->ops->readpage_io_hook(page, cur,
2667                                                           cur + iosize - 1);
2668                 }
2669                 if (!ret) {
2670                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2671                         pnr -= page->index;
2672                         ret = submit_extent_page(READ, tree, page,
2673                                          sector, disk_io_size, pg_offset,
2674                                          bdev, bio, pnr,
2675                                          end_bio_extent_readpage, mirror_num,
2676                                          *bio_flags,
2677                                          this_bio_flag);
2678                         nr++;
2679                         *bio_flags = this_bio_flag;
2680                 }
2681                 if (ret)
2682                         SetPageError(page);
2683                 cur = cur + iosize;
2684                 pg_offset += iosize;
2685         }
2686 out:
2687         if (!nr) {
2688                 if (!PageError(page))
2689                         SetPageUptodate(page);
2690                 unlock_page(page);
2691         }
2692         return 0;
2693 }
2694
2695 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2696                             get_extent_t *get_extent, int mirror_num)
2697 {
2698         struct bio *bio = NULL;
2699         unsigned long bio_flags = 0;
2700         int ret;
2701
2702         ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2703                                       &bio_flags);
2704         if (bio)
2705                 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2706         return ret;
2707 }
2708
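     /*
      * account for pages already written: shrink wbc->nr_to_write and, for
      * cyclic or whole-file writeback, advance writeback_index past them.
      */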
2709 static noinline void update_nr_written(struct page *page,
2710                                       struct writeback_control *wbc,
2711                                       unsigned long nr_written)
2712 {
2713         wbc->nr_to_write -= nr_written;
2714         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2715             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2716                 page->mapping->writeback_index = page->index + nr_written;
2717 }
2718
2719 /*
2720  * the writepage semantics are similar to regular writepage.  extent
2721  * records are inserted to lock ranges in the tree, and as dirty areas
2722  * are found, they are marked writeback.  Then the lock bits are removed
2723  * and the end_io handler clears the writeback ranges
2724  */
2725 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2726                               void *data)
2727 {
2728         struct inode *inode = page->mapping->host;
2729         struct extent_page_data *epd = data;
2730         struct extent_io_tree *tree = epd->tree;
2731         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2732         u64 delalloc_start;
2733         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2734         u64 end;
2735         u64 cur = start;
2736         u64 extent_offset;
2737         u64 last_byte = i_size_read(inode);
2738         u64 block_start;
2739         u64 iosize;
2740         sector_t sector;
2741         struct extent_state *cached_state = NULL;
2742         struct extent_map *em;
2743         struct block_device *bdev;
2744         int ret;
2745         int nr = 0;
2746         size_t pg_offset = 0;
2747         size_t blocksize;
2748         loff_t i_size = i_size_read(inode);
2749         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2750         u64 nr_delalloc;
2751         u64 delalloc_end;
2752         int page_started;
2753         int compressed;
2754         int write_flags;
2755         unsigned long nr_written = 0;
2756         bool fill_delalloc = true;
2757
2758         if (wbc->sync_mode == WB_SYNC_ALL)
2759                 write_flags = WRITE_SYNC;
2760         else
2761                 write_flags = WRITE;
2762
2763         trace___extent_writepage(page, inode, wbc);
2764
2765         WARN_ON(!PageLocked(page));
2766
2767         ClearPageError(page);
2768
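             /*
              * pages that sit entirely past i_size have nothing to write
              * back; invalidate them instead of writing stale data.
              */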
2769         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2770         if (page->index > end_index ||
2771            (page->index == end_index && !pg_offset)) {
2772                 page->mapping->a_ops->invalidatepage(page, 0);
2773                 unlock_page(page);
2774                 return 0;
2775         }
2776
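             /*
              * the last page may extend past i_size; zero the part of the
              * page beyond i_size so stale data is never written out.
              */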
2777         if (page->index == end_index) {
2778                 char *userpage;
2779
2780                 userpage = kmap_atomic(page, KM_USER0);
2781                 memset(userpage + pg_offset, 0,
2782                        PAGE_CACHE_SIZE - pg_offset);
2783                 kunmap_atomic(userpage, KM_USER0);
2784                 flush_dcache_page(page);
2785         }
2786         pg_offset = 0;
2787
2788         set_page_extent_mapped(page);
2789
2790         if (!tree->ops || !tree->ops->fill_delalloc)
2791                 fill_delalloc = false;
2792
2793         delalloc_start = start;
2794         delalloc_end = 0;
2795         page_started = 0;
2796         if (!epd->extent_locked && fill_delalloc) {
2797                 u64 delalloc_to_write = 0;
2798                 /*
2799                  * make sure the wbc mapping index is at least updated
2800                  * to this page.
2801                  */
2802                 update_nr_written(page, wbc, 0);
2803
2804                 while (delalloc_end < page_end) {
2805                         nr_delalloc = find_lock_delalloc_range(inode, tree,
2806                                                        page,
2807                                                        &delalloc_start,
2808                                                        &delalloc_end,
2809                                                        128 * 1024 * 1024);
2810                         if (nr_delalloc == 0) {
2811                                 delalloc_start = delalloc_end + 1;
2812                                 continue;
2813                         }
2814                         tree->ops->fill_delalloc(inode, page, delalloc_start,
2815                                                  delalloc_end, &page_started,
2816                                                  &nr_written);
2817                         /*
2818                          * delalloc_end is already one less than the total
2819                          * length, so we don't subtract one from
2820                          * PAGE_CACHE_SIZE
2821                          */
2822                         delalloc_to_write += (delalloc_end - delalloc_start +
2823                                               PAGE_CACHE_SIZE) >>
2824                                               PAGE_CACHE_SHIFT;
2825                         delalloc_start = delalloc_end + 1;
2826                 }
2827                 if (wbc->nr_to_write < delalloc_to_write) {
2828                         int thresh = 8192;
2829
2830                         if (delalloc_to_write < thresh * 2)
2831                                 thresh = delalloc_to_write;
2832                         wbc->nr_to_write = min_t(u64, delalloc_to_write,
2833                                                  thresh);
2834                 }
2835
2836                 /* did the fill delalloc function already unlock and start
2837                  * the IO?
2838                  */
2839                 if (page_started) {
2840                         ret = 0;
2841                         /*
2842                          * we've unlocked the page, so we can't update
2843                          * the mapping's writeback index, just update
2844                          * nr_to_write.
2845                          */
2846                         wbc->nr_to_write -= nr_written;
2847                         goto done_unlocked;
2848                 }
2849         }
2850         if (tree->ops && tree->ops->writepage_start_hook) {
2851                 ret = tree->ops->writepage_start_hook(page, start,
2852                                                       page_end);
2853                 if (ret == -EAGAIN) {
2854                         redirty_page_for_writepage(wbc, page);
2855                         update_nr_written(page, wbc, nr_written);
2856                         unlock_page(page);
2857                         ret = 0;
2858                         goto done_unlocked;
2859                 }
2860         }
2861
2862         /*
2863          * we don't want to touch the inode after unlocking the page,
2864          * so we update the mapping writeback index now
2865          */
2866         update_nr_written(page, wbc, nr_written + 1);
2867
2868         end = page_end;
2869         if (last_byte <= start) {
2870                 if (tree->ops && tree->ops->writepage_end_io_hook)
2871                         tree->ops->writepage_end_io_hook(page, start,
2872                                                          page_end, NULL, 1);
2873                 goto done;
2874         }
2875
2876         blocksize = inode->i_sb->s_blocksize;
2877
2878         while (cur <= end) {
2879                 if (cur >= last_byte) {
2880                         if (tree->ops && tree->ops->writepage_end_io_hook)
2881                                 tree->ops->writepage_end_io_hook(page, cur,
2882                                                          page_end, NULL, 1);
2883                         break;
2884                 }
2885                 em = epd->get_extent(inode, page, pg_offset, cur,
2886                                      end - cur + 1, 1);
2887                 if (IS_ERR_OR_NULL(em)) {
2888                         SetPageError(page);
2889                         break;
2890                 }
2891
2892                 extent_offset = cur - em->start;
2893                 BUG_ON(extent_map_end(em) <= cur);
2894                 BUG_ON(end < cur);
2895                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2896                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2897                 sector = (em->block_start + extent_offset) >> 9;
2898                 bdev = em->bdev;
2899                 block_start = em->block_start;
2900                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2901                 free_extent_map(em);
2902                 em = NULL;
2903
2904                 /*
2905                  * compressed and inline extents are written through other
2906                  * paths in the FS
2907                  */
2908                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2909                     block_start == EXTENT_MAP_INLINE) {
2910                         /*
2911                          * end_io notification does not happen here for
2912                          * compressed extents
2913                          */
2914                         if (!compressed && tree->ops &&
2915                             tree->ops->writepage_end_io_hook)
2916                                 tree->ops->writepage_end_io_hook(page, cur,
2917                                                          cur + iosize - 1,
2918                                                          NULL, 1);
2919                         else if (compressed) {
2920                                 /* we don't want to end_page_writeback on
2921                                  * a compressed extent.  this happens
2922                                  * elsewhere
2923                                  */
2924                                 nr++;
2925                         }
2926
2927                         cur += iosize;
2928                         pg_offset += iosize;
2929                         continue;
2930                 }
2931                 /* leave this out until we have a page_mkwrite call */
2932                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2933                                    EXTENT_DIRTY, 0, NULL)) {
2934                         cur = cur + iosize;
2935                         pg_offset += iosize;
2936                         continue;
2937                 }
2938
2939                 if (tree->ops && tree->ops->writepage_io_hook) {
2940                         ret = tree->ops->writepage_io_hook(page, cur,
2941                                                 cur + iosize - 1);
2942                 } else {
2943                         ret = 0;
2944                 }
2945                 if (ret) {
2946                         SetPageError(page);
2947                 } else {
2948                         unsigned long max_nr = end_index + 1;
2949
2950                         set_range_writeback(tree, cur, cur + iosize - 1);
2951                         if (!PageWriteback(page)) {
2952                                 printk(KERN_ERR "btrfs warning page %lu not "
2953                                        "writeback, cur %llu end %llu\n",
2954                                        page->index, (unsigned long long)cur,
2955                                        (unsigned long long)end);
2956                         }
2957
2958                         ret = submit_extent_page(write_flags, tree, page,
2959                                                  sector, iosize, pg_offset,
2960                                                  bdev, &epd->bio, max_nr,
2961                                                  end_bio_extent_writepage,
2962                                                  0, 0, 0);
2963                         if (ret)
2964                                 SetPageError(page);
2965                 }
2966                 cur = cur + iosize;
2967                 pg_offset += iosize;
2968                 nr++;
2969         }
2970 done:
2971         if (nr == 0) {
2972                 /* make sure the mapping tag for page dirty gets cleared */
2973                 set_page_writeback(page);
2974                 end_page_writeback(page);
2975         }
2976         unlock_page(page);
2977
2978 done_unlocked:
2979
2980         /* drop our reference on any cached states */
2981         free_extent_state(cached_state);
2982         return 0;
2983 }
2984
2985 /**
2986  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2987  * @mapping: address space structure to write
2988  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2989  * @writepage: function called for each page
2990  * @data: data passed to writepage function
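      * @tree: extent io tree covering the pages being written
      * @flush_fn: called to submit any bio built up so far before we block
      *            waiting on a page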
2991  *
2992  * If a page is already under I/O, extent_write_cache_pages() skips it, even
2993  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2994  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2995  * and msync() need to guarantee that all the data which was dirty at the time
2996  * the call was made get new I/O started against them.  If wbc->sync_mode is
2997  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2998  * existing IO to complete.
2999  */
3000 static int extent_write_cache_pages(struct extent_io_tree *tree,
3001                              struct address_space *mapping,
3002                              struct writeback_control *wbc,
3003                              writepage_t writepage, void *data,
3004                              void (*flush_fn)(void *))
3005 {
3006         int ret = 0;
3007         int done = 0;
3008         int nr_to_write_done = 0;
3009         struct pagevec pvec;
3010         int nr_pages;
3011         pgoff_t index;
3012         pgoff_t end;            /* Inclusive */
3013         int scanned = 0;
3014         int tag;
3015
3016         pagevec_init(&pvec, 0);
3017         if (wbc->range_cyclic) {
3018                 index = mapping->writeback_index; /* Start from prev offset */
3019                 end = -1;
3020         } else {
3021                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
3022                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
3023                 scanned = 1;
3024         }
3025         if (wbc->sync_mode == WB_SYNC_ALL)
3026                 tag = PAGECACHE_TAG_TOWRITE;
3027         else
3028                 tag = PAGECACHE_TAG_DIRTY;
3029 retry:
3030         if (wbc->sync_mode == WB_SYNC_ALL)
3031                 tag_pages_for_writeback(mapping, index, end);
3032         while (!done && !nr_to_write_done && (index <= end) &&
3033                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3034                         min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3035                 unsigned i;
3036
3037                 scanned = 1;
3038                 for (i = 0; i < nr_pages; i++) {
3039                         struct page *page = pvec.pages[i];
3040
3041                         /*
3042                          * At this point we hold neither mapping->tree_lock nor
3043                          * lock on the page itself: the page may be truncated or
3044                          * invalidated (changing page->mapping to NULL), or even
3045                          * swizzled back from swapper_space to tmpfs file
3046                          * mapping
3047                          */
3048                         if (tree->ops &&
3049                             tree->ops->write_cache_pages_lock_hook) {
3050                                 tree->ops->write_cache_pages_lock_hook(page,
3051                                                                data, flush_fn);
3052                         } else {
3053                                 if (!trylock_page(page)) {
3054                                         flush_fn(data);
3055                                         lock_page(page);
3056                                 }
3057                         }
3058
3059                         if (unlikely(page->mapping != mapping)) {
3060                                 unlock_page(page);
3061                                 continue;
3062                         }
3063
3064                         if (!wbc->range_cyclic && page->index > end) {
3065                                 done = 1;
3066                                 unlock_page(page);
3067                                 continue;
3068                         }
3069
3070                         if (wbc->sync_mode != WB_SYNC_NONE) {
3071                                 if (PageWriteback(page))
3072                                         flush_fn(data);
3073                                 wait_on_page_writeback(page);
3074                         }
3075
3076                         if (PageWriteback(page) ||
3077                             !clear_page_dirty_for_io(page)) {
3078                                 unlock_page(page);
3079                                 continue;
3080                         }
3081
3082                         ret = (*writepage)(page, wbc, data);
3083
3084                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3085                                 unlock_page(page);
3086                                 ret = 0;
3087                         }
3088                         if (ret)
3089                                 done = 1;
3090
3091                         /*
3092                          * the filesystem may choose to bump up nr_to_write.
3093                          * We have to make sure to honor the new nr_to_write
3094                          * at any time
3095                          */
3096                         nr_to_write_done = wbc->nr_to_write <= 0;
3097                 }
3098                 pagevec_release(&pvec);
3099                 cond_resched();
3100         }
3101         if (!scanned && !done) {
3102                 /*
3103                  * We hit the last page and there is more work to be done: wrap
3104                  * back to the start of the file
3105                  */
3106                 scanned = 1;
3107                 index = 0;
3108                 goto retry;
3109         }
3110         return ret;
3111 }
3112
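     /*
      * submit whatever bio has been built up in the extent_page_data, using
      * WRITE_SYNC when the caller asked for synchronous writeback
      */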
3113 static void flush_epd_write_bio(struct extent_page_data *epd)
3114 {
3115         if (epd->bio) {
3116                 if (epd->sync_io)
3117                         submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
3118                 else
3119                         submit_one_bio(WRITE, epd->bio, 0, 0);
3120                 epd->bio = NULL;
3121         }
3122 }
3123
3124 static noinline void flush_write_bio(void *data)
3125 {
3126         struct extent_page_data *epd = data;
3127         flush_epd_write_bio(epd);
3128 }
3129
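     /*
      * write a single dirty page through __extent_writepage() and submit any
      * bio that was built up before returning to the caller
      */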
3130 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3131                           get_extent_t *get_extent,
3132                           struct writeback_control *wbc)
3133 {
3134         int ret;
3135         struct extent_page_data epd = {
3136                 .bio = NULL,
3137                 .tree = tree,
3138                 .get_extent = get_extent,
3139                 .extent_locked = 0,
3140                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3141         };
3142
3143         ret = __extent_writepage(page, wbc, &epd);
3144
3145         flush_epd_write_bio(&epd);
3146         return ret;
3147 }
3148
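     /*
      * write out an already-locked range, page by page.  extent_locked is set
      * in the extent_page_data, so __extent_writepage() skips the delalloc
      * fill; pages that are no longer dirty only get their end_io hook run.
      */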
3149 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3150                               u64 start, u64 end, get_extent_t *get_extent,
3151                               int mode)
3152 {
3153         int ret = 0;
3154         struct address_space *mapping = inode->i_mapping;
3155         struct page *page;
3156         unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3157                 PAGE_CACHE_SHIFT;
3158
3159         struct extent_page_data epd = {
3160                 .bio = NULL,
3161                 .tree = tree,
3162                 .get_extent = get_extent,
3163                 .extent_locked = 1,
3164                 .sync_io = mode == WB_SYNC_ALL,
3165         };
3166         struct writeback_control wbc_writepages = {
3167                 .sync_mode      = mode,
3168                 .nr_to_write    = nr_pages * 2,
3169                 .range_start    = start,
3170                 .range_end      = end + 1,
3171         };
3172
3173         while (start <= end) {
3174                 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3175                 if (clear_page_dirty_for_io(page))
3176                         ret = __extent_writepage(page, &wbc_writepages, &epd);
3177                 else {
3178                         if (tree->ops && tree->ops->writepage_end_io_hook)
3179                                 tree->ops->writepage_end_io_hook(page, start,
3180                                                  start + PAGE_CACHE_SIZE - 1,
3181                                                  NULL, 1);
3182                         unlock_page(page);
3183                 }
3184                 page_cache_release(page);
3185                 start += PAGE_CACHE_SIZE;
3186         }
3187
3188         flush_epd_write_bio(&epd);
3189         return ret;
3190 }
3191
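     /*
      * writeback entry point: walk the dirty pages of the mapping with
      * extent_write_cache_pages() and then flush the bio that was built up
      */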
3192 int extent_writepages(struct extent_io_tree *tree,
3193                       struct address_space *mapping,
3194                       get_extent_t *get_extent,
3195                       struct writeback_control *wbc)
3196 {
3197         int ret = 0;
3198         struct extent_page_data epd = {
3199                 .bio = NULL,
3200                 .tree = tree,
3201                 .get_extent = get_extent,
3202                 .extent_locked = 0,
3203                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3204         };
3205
3206         ret = extent_write_cache_pages(tree, mapping, wbc,
3207                                        __extent_writepage, &epd,
3208                                        flush_write_bio);
3209         flush_epd_write_bio(&epd);
3210         return ret;
3211 }
3212
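     /*
      * readahead entry point: add each page on the list to the page cache
      * and batch the reads into as few bios as possible
      */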
3213 int extent_readpages(struct extent_io_tree *tree,
3214                      struct address_space *mapping,
3215                      struct list_head *pages, unsigned nr_pages,
3216                      get_extent_t get_extent)
3217 {
3218         struct bio *bio = NULL;
3219         unsigned page_idx;
3220         unsigned long bio_flags = 0;
3221
3222         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3223                 struct page *page = list_entry(pages->prev, struct page, lru);
3224
3225                 prefetchw(&page->flags);
3226                 list_del(&page->lru);
3227                 if (!add_to_page_cache_lru(page, mapping,
3228                                         page->index, GFP_NOFS)) {
3229                         __extent_read_full_page(tree, page, get_extent,
3230                                                 &bio, 0, &bio_flags);
3231                 }
3232                 page_cache_release(page);
3233         }
3234         BUG_ON(!list_empty(pages));
3235         if (bio)
3236                 submit_one_bio(READ, bio, 0, bio_flags);
3237         return 0;
3238 }
3239
3240 /*
3241  * basic invalidatepage code, this waits on any locked or writeback
3242  * ranges corresponding to the page, and then deletes any extent state
3243  * records from the tree
3244  */
3245 int extent_invalidatepage(struct extent_io_tree *tree,
3246                           struct page *page, unsigned long offset)
3247 {
3248         struct extent_state *cached_state = NULL;
3249         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3250         u64 end = start + PAGE_CACHE_SIZE - 1;
3251         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3252
3253         start += (offset + blocksize - 1) & ~(blocksize - 1);
3254         if (start > end)
3255                 return 0;
3256
3257         lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
3258         wait_on_page_writeback(page);
3259         clear_extent_bit(tree, start, end,
3260                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3261                          EXTENT_DO_ACCOUNTING,
3262                          1, 1, &cached_state, GFP_NOFS);
3263         return 0;
3264 }
3265
3266 /*
3267  * a helper for releasepage, this tests for areas of the page that
3268  * are locked or under IO and drops the related state bits if it is safe
3269  * to drop the page.
3270  */
3271 int try_release_extent_state(struct extent_map_tree *map,
3272                              struct extent_io_tree *tree, struct page *page,
3273                              gfp_t mask)
3274 {
3275         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3276         u64 end = start + PAGE_CACHE_SIZE - 1;
3277         int ret = 1;
3278
3279         if (test_range_bit(tree, start, end,
3280                            EXTENT_IOBITS, 0, NULL))
3281                 ret = 0;
3282         else {
3283                 if ((mask & GFP_NOFS) == GFP_NOFS)
3284                         mask = GFP_NOFS;
3285                 /*
3286                  * at this point we can safely clear everything except the
3287                  * locked bit and the nodatasum bit
3288                  */
3289                 ret = clear_extent_bit(tree, start, end,
3290                                  ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3291                                  0, 0, NULL, mask);
3292
3293                 /* if clear_extent_bit failed for enomem reasons,
3294                  * we can't allow the release to continue.
3295                  */
3296                 if (ret < 0)
3297                         ret = 0;
3298                 else
3299                         ret = 1;
3300         }
3301         return ret;
3302 }
3303
3304 /*
3305  * a helper for releasepage.  As long as there are no locked extents
3306  * in the range corresponding to the page, both state records and extent
3307  * map records are removed
3308  */
3309 int try_release_extent_mapping(struct extent_map_tree *map,
3310                                struct extent_io_tree *tree, struct page *page,
3311                                gfp_t mask)
3312 {
3313         struct extent_map *em;
3314         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3315         u64 end = start + PAGE_CACHE_SIZE - 1;
3316
3317         if ((mask & __GFP_WAIT) &&
3318             page->mapping->host->i_size > 16 * 1024 * 1024) {
3319                 u64 len;
3320                 while (start <= end) {
3321                         len = end - start + 1;
3322                         write_lock(&map->lock);
3323                         em = lookup_extent_mapping(map, start, len);
3324                         if (IS_ERR_OR_NULL(em)) {
3325                                 write_unlock(&map->lock);
3326                                 break;
3327                         }
3328                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3329                             em->start != start) {
3330                                 write_unlock(&map->lock);
3331                                 free_extent_map(em);
3332                                 break;
3333                         }
3334                         if (!test_range_bit(tree, em->start,
3335                                             extent_map_end(em) - 1,
3336                                             EXTENT_LOCKED | EXTENT_WRITEBACK,
3337                                             0, NULL)) {
3338                                 remove_extent_mapping(map, em);
3339                                 /* once for the rb tree */
3340                                 free_extent_map(em);
3341                         }
3342                         start = extent_map_end(em);
3343                         write_unlock(&map->lock);
3344
3345                         /* once for us */
3346                         free_extent_map(em);
3347                 }
3348         }
3349         return try_release_extent_state(map, tree, page, mask);
3350 }
3351
3352 /*
3353  * helper function for fiemap, which doesn't want to see any holes.
3354  * This maps until we find something past 'last'
3355  */
3356 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3357                                                 u64 offset,
3358                                                 u64 last,
3359                                                 get_extent_t *get_extent)
3360 {
3361         u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3362         struct extent_map *em;
3363         u64 len;
3364
3365         if (offset >= last)
3366                 return NULL;
3367
3368         while (1) {
3369                 len = last - offset;
3370                 if (len == 0)
3371                         break;
3372                 len = (len + sectorsize - 1) & ~(sectorsize - 1);
3373                 em = get_extent(inode, NULL, 0, offset, len, 0);
3374                 if (IS_ERR_OR_NULL(em))
3375                         return em;
3376
3377                 /* if this isn't a hole return it */
3378                 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3379                     em->block_start != EXTENT_MAP_HOLE) {
3380                         return em;
3381                 }
3382
3383                 /* this is a hole, advance to the next extent */
3384                 offset = extent_map_end(em);
3385                 free_extent_map(em);
3386                 if (offset >= last)
3387                         break;
3388         }
3389         return NULL;
3390 }
3391
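     /*
      * fill in fiemap records for the range [start, start + len): look up the
      * last file extent so preallocation past i_size is honored, lock the
      * range, then walk the extent maps (skipping holes) and emit one fiemap
      * extent per mapping, flagging the final one with FIEMAP_EXTENT_LAST
      */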
3392 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3393                 __u64 start, __u64 len, get_extent_t *get_extent)
3394 {
3395         int ret = 0;
3396         u64 off = start;
3397         u64 max = start + len;
3398         u32 flags = 0;
3399         u32 found_type;
3400         u64 last;
3401         u64 last_for_get_extent = 0;
3402         u64 disko = 0;
3403         u64 isize = i_size_read(inode);
3404         struct btrfs_key found_key;
3405         struct extent_map *em = NULL;
3406         struct extent_state *cached_state = NULL;
3407         struct btrfs_path *path;
3408         struct btrfs_file_extent_item *item;
3409         int end = 0;
3410         u64 em_start = 0;
3411         u64 em_len = 0;
3412         u64 em_end = 0;
3413         unsigned long emflags;
3414
3415         if (len == 0)
3416                 return -EINVAL;
3417
3418         path = btrfs_alloc_path();
3419         if (!path)
3420                 return -ENOMEM;
3421         path->leave_spinning = 1;
3422
3423         start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3424         len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3425
3426         /*
3427          * lookup the last file extent.  We're not using i_size here
3428          * because there might be preallocation past i_size
3429          */
3430         ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3431                                        path, btrfs_ino(inode), -1, 0);
3432         if (ret < 0) {
3433                 btrfs_free_path(path);
3434                 return ret;
3435         }
3436         WARN_ON(!ret);
3437         path->slots[0]--;
3438         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3439                               struct btrfs_file_extent_item);
3440         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3441         found_type = btrfs_key_type(&found_key);
3442
3443         /* No extents, but there might be delalloc bits */
3444         if (found_key.objectid != btrfs_ino(inode) ||
3445             found_type != BTRFS_EXTENT_DATA_KEY) {
3446                 /* have to trust i_size as the end */
3447                 last = (u64)-1;
3448                 last_for_get_extent = isize;
3449         } else {
3450                 /*
3451                  * remember the start of the last extent.  There are a
3452                  * bunch of different factors that go into the length of the
3453                  * extent, so it's much less complex to remember where it started
3454                  */
3455                 last = found_key.offset;
3456                 last_for_get_extent = last + 1;
3457         }
3458         btrfs_free_path(path);
3459
3460         /*
3461          * we might have some extents allocated but more delalloc past those
3462          * extents.  so, we trust isize unless the start of the last extent is
3463          * beyond isize
3464          */
3465         if (last < isize) {
3466                 last = (u64)-1;
3467                 last_for_get_extent = isize;
3468         }
3469
3470         lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3471                          &cached_state, GFP_NOFS);
3472
3473         em = get_extent_skip_holes(inode, start, last_for_get_extent,
3474                                    get_extent);
3475         if (!em)
3476                 goto out;
3477         if (IS_ERR(em)) {
3478                 ret = PTR_ERR(em);
3479                 goto out;
3480         }
3481
3482         while (!end) {
3483                 u64 offset_in_extent;
3484
3485                 /* break if the extent we found is outside the range */
3486                 if (em->start >= max || extent_map_end(em) < off)
3487                         break;
3488
3489                 /*
3490                  * get_extent may return an extent that starts before our
3491                  * requested range.  We have to make sure the ranges
3492                  * we return to fiemap always move forward and don't
3493                  * overlap, so adjust the offsets here
3494                  */
3495                 em_start = max(em->start, off);
3496
3497                 /*
3498                  * record the offset from the start of the extent
3499                  * for adjusting the disk offset below
3500                  */
3501                 offset_in_extent = em_start - em->start;
3502                 em_end = extent_map_end(em);
3503                 em_len = em_end - em_start;
3504                 emflags = em->flags;
3505                 disko = 0;
3506                 flags = 0;
3507
3508                 /*
3509                  * bump off for our next call to get_extent
3510                  */
3511                 off = extent_map_end(em);
3512                 if (off >= max)
3513                         end = 1;
3514
3515                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3516                         end = 1;
3517                         flags |= FIEMAP_EXTENT_LAST;
3518                 } else if (em->block_start == EXTENT_MAP_INLINE) {
3519                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
3520                                   FIEMAP_EXTENT_NOT_ALIGNED);
3521                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
3522                         flags |= (FIEMAP_EXTENT_DELALLOC |
3523                                   FIEMAP_EXTENT_UNKNOWN);
3524                 } else {
3525                         disko = em->block_start + offset_in_extent;
3526                 }
3527                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3528                         flags |= FIEMAP_EXTENT_ENCODED;
3529
3530                 free_extent_map(em);
3531                 em = NULL;
3532                 if ((em_start >= last) || em_len == (u64)-1 ||
3533                    (last == (u64)-1 && isize <= em_end)) {
3534                         flags |= FIEMAP_EXTENT_LAST;
3535                         end = 1;
3536                 }
3537
3538                 /* now scan forward to see if this is really the last extent. */
3539                 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3540                                            get_extent);
3541                 if (IS_ERR(em)) {
3542                         ret = PTR_ERR(em);
3543                         goto out;
3544                 }
3545                 if (!em) {
3546                         flags |= FIEMAP_EXTENT_LAST;
3547                         end = 1;
3548                 }
3549                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3550                                               em_len, flags);
3551                 if (ret)
3552                         goto out_free;
3553         }
3554 out_free:
3555         free_extent_map(em);
3556 out:
3557         unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3558                              &cached_state, GFP_NOFS);
3559         return ret;
3560 }
3561
3562 inline struct page *extent_buffer_page(struct extent_buffer *eb,
3563                                               unsigned long i)
3564 {
3565         struct page *p;
3566         struct address_space *mapping;
3567
3568         if (i == 0)
3569                 return eb->first_page;
3570         i += eb->start >> PAGE_CACHE_SHIFT;
3571         mapping = eb->first_page->mapping;
3572         if (!mapping)
3573                 return NULL;
3574
3575         /*
3576          * extent_buffer_page is only called after pinning the page
3577          * by increasing the reference count.  So we know the page must
3578          * be in the radix tree.
3579          */
3580         rcu_read_lock();
3581         p = radix_tree_lookup(&mapping->page_tree, i);
3582         rcu_read_unlock();
3583
3584         return p;
3585 }
3586
3587 inline unsigned long num_extent_pages(u64 start, u64 len)
3588 {
3589         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3590                 (start >> PAGE_CACHE_SHIFT);
3591 }
3592
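     /*
      * allocate a bare extent_buffer for [start, start + len) and initialize
      * its lock state and wait queues; the backing pages are attached by the
      * caller
      */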
3593 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3594                                                    u64 start,
3595                                                    unsigned long len,
3596                                                    gfp_t mask)
3597 {
3598         struct extent_buffer *eb = NULL;
3599 #if LEAK_DEBUG
3600         unsigned long flags;
3601 #endif
3602
3603         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3604         if (eb == NULL)
3605                 return NULL;
3606         eb->start = start;
3607         eb->len = len;
3608         rwlock_init(&eb->lock);
3609         atomic_set(&eb->write_locks, 0);
3610         atomic_set(&eb->read_locks, 0);
3611         atomic_set(&eb->blocking_readers, 0);
3612         atomic_set(&eb->blocking_writers, 0);
3613         atomic_set(&eb->spinning_readers, 0);
3614         atomic_set(&eb->spinning_writers, 0);
3615         init_waitqueue_head(&eb->write_lock_wq);
3616         init_waitqueue_head(&eb->read_lock_wq);
3617
3618 #if LEAK_DEBUG
3619         spin_lock_irqsave(&leak_lock, flags);
3620         list_add(&eb->leak_list, &buffers);
3621         spin_unlock_irqrestore(&leak_lock, flags);
3622 #endif
3623         atomic_set(&eb->refs, 1);
3624
3625         return eb;
3626 }
3627
3628 static void __free_extent_buffer(struct extent_buffer *eb)
3629 {
3630 #if LEAK_DEBUG
3631         unsigned long flags;
3632         spin_lock_irqsave(&leak_lock, flags);
3633         list_del(&eb->leak_list);
3634         spin_unlock_irqrestore(&leak_lock, flags);
3635 #endif
3636         kmem_cache_free(extent_buffer_cache, eb);
3637 }
3638
3639 /*
3640  * Helper for dropping the extent buffer's page references, from the last page down to start_idx.
3641  */
3642 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3643                                                 unsigned long start_idx)
3644 {
3645         unsigned long index;
3646         struct page *page;
3647
3648         if (!eb->first_page)
3649                 return;
3650
3651         index = num_extent_pages(eb->start, eb->len);
3652         if (start_idx >= index)
3653                 return;
3654
3655         do {
3656                 index--;
3657                 page = extent_buffer_page(eb, index);
3658                 if (page)
3659                         page_cache_release(page);
3660         } while (index != start_idx);
3661 }
3662
3663 /*
3664  * Helper for releasing the extent buffer.
3665  */
3666 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3667 {
3668         btrfs_release_extent_buffer_page(eb, 0);
3669         __free_extent_buffer(eb);
3670 }
3671
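     /*
      * find or create the extent buffer covering [start, start + len).  A
      * radix tree lookup under RCU is tried first; otherwise a new buffer is
      * allocated, its pages are attached and it is inserted into the tree.
      * If another thread raced us in, that buffer is returned instead, with
      * an extra reference held for the caller.
      */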
3672 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3673                                           u64 start, unsigned long len,
3674                                           struct page *page0)
3675 {
3676         unsigned long num_pages = num_extent_pages(start, len);
3677         unsigned long i;
3678         unsigned long index = start >> PAGE_CACHE_SHIFT;
3679         struct extent_buffer *eb;
3680         struct extent_buffer *exists = NULL;
3681         struct page *p;
3682         struct address_space *mapping = tree->mapping;
3683         int uptodate = 1;
3684         int ret;
3685
3686         rcu_read_lock();
3687         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3688         if (eb && atomic_inc_not_zero(&eb->refs)) {
3689                 rcu_read_unlock();
3690                 mark_page_accessed(eb->first_page);
3691                 return eb;
3692         }
3693         rcu_read_unlock();
3694
3695         eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3696         if (!eb)
3697                 return NULL;
3698
3699         if (page0) {
3700                 eb->first_page = page0;
3701                 i = 1;
3702                 index++;
3703                 page_cache_get(page0);
3704                 mark_page_accessed(page0);
3705                 set_page_extent_mapped(page0);
3706                 set_page_extent_head(page0, len);
3707                 uptodate = PageUptodate(page0);
3708         } else {
3709                 i = 0;
3710         }
3711         for (; i < num_pages; i++, index++) {
3712                 p = find_or_create_page(mapping, index, GFP_NOFS);
3713                 if (!p) {
3714                         WARN_ON(1);
3715                         goto free_eb;
3716                 }
3717                 set_page_extent_mapped(p);
3718                 mark_page_accessed(p);
3719                 if (i == 0) {
3720                         eb->first_page = p;
3721                         set_page_extent_head(p, len);
3722                 } else {
3723                         set_page_private(p, EXTENT_PAGE_PRIVATE);
3724                 }
3725                 if (!PageUptodate(p))
3726                         uptodate = 0;
3727
3728                 /*
3729                  * see below about how we avoid a nasty race with releasepage
3730                  * and why we unlock later
3731                  */
3732                 if (i != 0)
3733                         unlock_page(p);
3734         }
3735         if (uptodate)
3736                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3737
3738         ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3739         if (ret)
3740                 goto free_eb;
3741
3742         spin_lock(&tree->buffer_lock);
3743         ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3744         if (ret == -EEXIST) {
3745                 exists = radix_tree_lookup(&tree->buffer,
3746                                                 start >> PAGE_CACHE_SHIFT);
3747                 /* add one reference for the caller */
3748                 atomic_inc(&exists->refs);
3749                 spin_unlock(&tree->buffer_lock);
3750                 radix_tree_preload_end();
3751                 goto free_eb;
3752         }
3753         /* add one reference for the tree */
3754         atomic_inc(&eb->refs);
3755         spin_unlock(&tree->buffer_lock);
3756         radix_tree_preload_end();
3757
3758         /*
3759          * there is a race where releasepage may have
3760          * tried to find this extent buffer in the radix
3761          * tree but failed.  It will tell the VM it is safe to
3762          * reclaim the page, and it will clear the page private bit.
3763          * We must make sure to set the page private bit properly
3764          * after the extent buffer is in the radix tree so
3765          * it doesn't get lost
3766          */
3767         set_page_extent_mapped(eb->first_page);
3768         set_page_extent_head(eb->first_page, eb->len);
3769         if (!page0)
3770                 unlock_page(eb->first_page);
3771         return eb;
3772
3773 free_eb:
3774         if (eb->first_page && !page0)
3775                 unlock_page(eb->first_page);
3776
3777         if (!atomic_dec_and_test(&eb->refs))
3778                 return exists;
3779         btrfs_release_extent_buffer(eb);
3780         return exists;
3781 }
3782
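     /*
      * lookup-only variant of alloc_extent_buffer(): return the buffer from
      * the radix tree if its reference count can still be raised, else NULL
      */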
3783 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3784                                          u64 start, unsigned long len)
3785 {
3786         struct extent_buffer *eb;
3787
3788         rcu_read_lock();
3789         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3790         if (eb && atomic_inc_not_zero(&eb->refs)) {
3791                 rcu_read_unlock();
3792                 mark_page_accessed(eb->first_page);
3793                 return eb;
3794         }
3795         rcu_read_unlock();
3796
3797         return NULL;
3798 }
3799
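     /*
      * drop the caller's reference.  The radix tree holds its own reference,
      * so the count should never reach zero here; the WARN_ON below fires if
      * it does.
      */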
3800 void free_extent_buffer(struct extent_buffer *eb)
3801 {
3802         if (!eb)
3803                 return;
3804
3805         if (!atomic_dec_and_test(&eb->refs))
3806                 return;
3807
3808         WARN_ON(1);
3809 }
3810
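     /*
      * clear the dirty state on every page backing the buffer, including the
      * PAGECACHE_TAG_DIRTY tag in the mapping's radix tree, so writeback will
      * not pick the pages up again
      */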
3811 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3812                               struct extent_buffer *eb)
3813 {
3814         unsigned long i;
3815         unsigned long num_pages;
3816         struct page *page;
3817
3818         num_pages = num_extent_pages(eb->start, eb->len);
3819
3820         for (i = 0; i < num_pages; i++) {
3821                 page = extent_buffer_page(eb, i);
3822                 if (!PageDirty(page))
3823                         continue;
3824
3825                 lock_page(page);
3826                 WARN_ON(!PagePrivate(page));
3827
3828                 set_page_extent_mapped(page);
3829                 if (i == 0)
3830                         set_page_extent_head(page, eb->len);
3831
3832                 clear_page_dirty_for_io(page);
3833                 spin_lock_irq(&page->mapping->tree_lock);
3834                 if (!PageDirty(page)) {
3835                         radix_tree_tag_clear(&page->mapping->page_tree,
3836                                                 page_index(page),
3837                                                 PAGECACHE_TAG_DIRTY);
3838                 }
3839                 spin_unlock_irq(&page->mapping->tree_lock);
3840                 ClearPageError(page);
3841                 unlock_page(page);
3842         }
3843         return 0;
3844 }
3845
3846 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3847                              struct extent_buffer *eb)
3848 {
3849         unsigned long i;
3850         unsigned long num_pages;
3851         int was_dirty = 0;
3852
3853         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3854         num_pages = num_extent_pages(eb->start, eb->len);
3855         for (i = 0; i < num_pages; i++)
3856                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3857         return was_dirty;
3858 }
3859
3860 static int __eb_straddles_pages(u64 start, u64 len)
3861 {
3862         if (len < PAGE_CACHE_SIZE)
3863                 return 1;
3864         if (start & (PAGE_CACHE_SIZE - 1))
3865                 return 1;
3866         if ((start + len) & (PAGE_CACHE_SIZE - 1))
3867                 return 1;
3868         return 0;
3869 }
3870
3871 static int eb_straddles_pages(struct extent_buffer *eb)
3872 {
3873         return __eb_straddles_pages(eb->start, eb->len);
3874 }
3875
3876 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3877                                 struct extent_buffer *eb,
3878                                 struct extent_state **cached_state)
3879 {
3880         unsigned long i;
3881         struct page *page;
3882         unsigned long num_pages;
3883
3884         num_pages = num_extent_pages(eb->start, eb->len);
3885         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3886
3887         if (eb_straddles_pages(eb)) {
3888                 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3889                                       cached_state, GFP_NOFS);
3890         }
3891         for (i = 0; i < num_pages; i++) {
3892                 page = extent_buffer_page(eb, i);
3893                 if (page)
3894                         ClearPageUptodate(page);
3895         }
3896         return 0;
3897 }
3898
3899 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3900                                 struct extent_buffer *eb)
3901 {
3902         unsigned long i;
3903         struct page *page;
3904         unsigned long num_pages;
3905
3906         num_pages = num_extent_pages(eb->start, eb->len);
3907
3908         if (eb_straddles_pages(eb)) {
3909                 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3910                                     NULL, GFP_NOFS);
3911         }
3912         for (i = 0; i < num_pages; i++) {
3913                 page = extent_buffer_page(eb, i);
3914                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3915                     ((i == num_pages - 1) &&
3916                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3917                         check_page_uptodate(tree, page);
3918                         continue;
3919                 }
3920                 SetPageUptodate(page);
3921         }
3922         return 0;
3923 }
3924
3925 int extent_range_uptodate(struct extent_io_tree *tree,
3926                           u64 start, u64 end)
3927 {
3928         struct page *page;
3929         int ret;
3930         int pg_uptodate = 1;
3931         int uptodate;
3932         unsigned long index;
3933
3934         if (__eb_straddles_pages(start, end - start + 1)) {
3935                 ret = test_range_bit(tree, start, end,
3936                                      EXTENT_UPTODATE, 1, NULL);
3937                 if (ret)
3938                         return 1;
3939         }
3940         while (start <= end) {
3941                 index = start >> PAGE_CACHE_SHIFT;
3942                 page = find_get_page(tree->mapping, index);
3943                 uptodate = PageUptodate(page);
3944                 page_cache_release(page);
3945                 if (!uptodate) {
3946                         pg_uptodate = 0;
3947                         break;
3948                 }
3949                 start += PAGE_CACHE_SIZE;
3950         }
3951         return pg_uptodate;
3952 }
3953
3954 int extent_buffer_uptodate(struct extent_io_tree *tree,
3955                            struct extent_buffer *eb,
3956                            struct extent_state *cached_state)
3957 {
3958         int ret = 0;
3959         unsigned long num_pages;
3960         unsigned long i;
3961         struct page *page;
3962         int pg_uptodate = 1;
3963
3964         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3965                 return 1;
3966
3967         if (eb_straddles_pages(eb)) {
3968                 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3969                                    EXTENT_UPTODATE, 1, cached_state);
3970                 if (ret)
3971                         return ret;
3972         }
3973
3974         num_pages = num_extent_pages(eb->start, eb->len);
3975         for (i = 0; i < num_pages; i++) {
3976                 page = extent_buffer_page(eb, i);
3977                 if (!PageUptodate(page)) {
3978                         pg_uptodate = 0;
3979                         break;
3980                 }
3981         }
3982         return pg_uptodate;
3983 }
3984
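     /*
      * read in any pages of the buffer that are not uptodate.  WAIT_NONE only
      * trylocks the pages and never blocks; WAIT_COMPLETE waits for the reads
      * to finish and sets EXTENT_BUFFER_UPTODATE once they all succeed.
      */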
3985 int read_extent_buffer_pages(struct extent_io_tree *tree,
3986                              struct extent_buffer *eb, u64 start, int wait,
3987                              get_extent_t *get_extent, int mirror_num)
3988 {
3989         unsigned long i;
3990         unsigned long start_i;
3991         struct page *page;
3992         int err;
3993         int ret = 0;
3994         int locked_pages = 0;
3995         int all_uptodate = 1;
3996         int inc_all_pages = 0;
3997         unsigned long num_pages;
3998         struct bio *bio = NULL;
3999         unsigned long bio_flags = 0;
4000
4001         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4002                 return 0;
4003
4004         if (eb_straddles_pages(eb)) {
4005                 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
4006                                    EXTENT_UPTODATE, 1, NULL)) {
4007                         return 0;
4008                 }
4009         }
4010
4011         if (start) {
4012                 WARN_ON(start < eb->start);
4013                 start_i = (start >> PAGE_CACHE_SHIFT) -
4014                         (eb->start >> PAGE_CACHE_SHIFT);
4015         } else {
4016                 start_i = 0;
4017         }
4018
4019         num_pages = num_extent_pages(eb->start, eb->len);
4020         for (i = start_i; i < num_pages; i++) {
4021                 page = extent_buffer_page(eb, i);
4022                 if (wait == WAIT_NONE) {
4023                         if (!trylock_page(page))
4024                                 goto unlock_exit;
4025                 } else {
4026                         lock_page(page);
4027                 }
4028                 locked_pages++;
4029                 if (!PageUptodate(page))
4030                         all_uptodate = 0;
4031         }
4032         if (all_uptodate) {
4033                 if (start_i == 0)
4034                         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4035                 goto unlock_exit;
4036         }
4037
4038         for (i = start_i; i < num_pages; i++) {
4039                 page = extent_buffer_page(eb, i);
4040
4041                 WARN_ON(!PagePrivate(page));
4042
4043                 set_page_extent_mapped(page);
4044                 if (i == 0)
4045                         set_page_extent_head(page, eb->len);
4046
4047                 if (inc_all_pages)
4048                         page_cache_get(page);
4049                 if (!PageUptodate(page)) {
4050                         if (start_i == 0)
4051                                 inc_all_pages = 1;
4052                         ClearPageError(page);
4053                         err = __extent_read_full_page(tree, page,
4054                                                       get_extent, &bio,
4055                                                       mirror_num, &bio_flags);
4056                         if (err)
4057                                 ret = err;
4058                 } else {
4059                         unlock_page(page);
4060                 }
4061         }
4062
4063         if (bio)
4064                 submit_one_bio(READ, bio, mirror_num, bio_flags);
4065
4066         if (ret || wait != WAIT_COMPLETE)
4067                 return ret;
4068
4069         for (i = start_i; i < num_pages; i++) {
4070                 page = extent_buffer_page(eb, i);
4071                 wait_on_page_locked(page);
4072                 if (!PageUptodate(page))
4073                         ret = -EIO;
4074         }
4075
4076         if (!ret)
4077                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4078         return ret;
4079
4080 unlock_exit:
4081         i = start_i;
4082         while (locked_pages > 0) {
4083                 page = extent_buffer_page(eb, i);
4084                 i++;
4085                 unlock_page(page);
4086                 locked_pages--;
4087         }
4088         return ret;
4089 }
4090
4091 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4092                         unsigned long start,
4093                         unsigned long len)
4094 {
4095         size_t cur;
4096         size_t offset;
4097         struct page *page;
4098         char *kaddr;
4099         char *dst = (char *)dstv;
4100         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4101         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4102
4103         WARN_ON(start > eb->len);
4104         WARN_ON(start + len > eb->start + eb->len);
4105
4106         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4107
4108         while (len > 0) {
4109                 page = extent_buffer_page(eb, i);
4110
4111                 cur = min(len, (PAGE_CACHE_SIZE - offset));
4112                 kaddr = page_address(page);
4113                 memcpy(dst, kaddr + offset, cur);
4114
4115                 dst += cur;
4116                 len -= cur;
4117                 offset = 0;
4118                 i++;
4119         }
4120 }
4121
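     /*
      * map min_len bytes of the buffer starting at 'start', provided they sit
      * in a single page.  *map is set to the kernel address of buffer offset
      * *map_start, with *map_len bytes addressable from there; -EINVAL is
      * returned when [start, start + min_len) straddles a page boundary.
      *
      * Rough usage sketch (variable names are illustrative only):
      *
      *	char *kaddr;
      *	unsigned long map_start, map_len;
      *
      *	if (!map_private_extent_buffer(eb, offset, size, &kaddr,
      *				       &map_start, &map_len))
      *		memcpy(dst, kaddr + (offset - map_start), size);
      *	else
      *		read_extent_buffer(eb, dst, offset, size);
      */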
4122 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4123                                unsigned long min_len, char **map,
4124                                unsigned long *map_start,
4125                                unsigned long *map_len)
4126 {
4127         size_t offset = start & (PAGE_CACHE_SIZE - 1);
4128         char *kaddr;
4129         struct page *p;
4130         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4131         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4132         unsigned long end_i = (start_offset + start + min_len - 1) >>
4133                 PAGE_CACHE_SHIFT;
4134
4135         if (i != end_i)
4136                 return -EINVAL;
4137
4138         if (i == 0) {
4139                 offset = start_offset;
4140                 *map_start = 0;
4141         } else {
4142                 offset = 0;
4143                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4144         }
4145
4146         if (start + min_len > eb->len) {
4147                 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4148                        "wanted %lu %lu\n", (unsigned long long)eb->start,
4149                        eb->len, start, min_len);
4150                 WARN_ON(1);
4151                 return -EINVAL;
4152         }
4153
4154         p = extent_buffer_page(eb, i);
4155         kaddr = page_address(p);
4156         *map = kaddr + offset;
4157         *map_len = PAGE_CACHE_SIZE - offset;
4158         return 0;
4159 }
4160
4161 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4162                           unsigned long start,
4163                           unsigned long len)
4164 {
4165         size_t cur;
4166         size_t offset;
4167         struct page *page;
4168         char *kaddr;
4169         char *ptr = (char *)ptrv;
4170         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4171         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4172         int ret = 0;
4173
4174         WARN_ON(start > eb->len);
4175         WARN_ON(start + len > eb->start + eb->len);
4176
4177         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4178
4179         while (len > 0) {
4180                 page = extent_buffer_page(eb, i);
4181
4182                 cur = min(len, (PAGE_CACHE_SIZE - offset));
4183
4184                 kaddr = page_address(page);
4185                 ret = memcmp(ptr, kaddr + offset, cur);
4186                 if (ret)
4187                         break;
4188
4189                 ptr += cur;
4190                 len -= cur;
4191                 offset = 0;
4192                 i++;
4193         }
4194         return ret;
4195 }
4196
4197 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4198                          unsigned long start, unsigned long len)
4199 {
4200         size_t cur;
4201         size_t offset;
4202         struct page *page;
4203         char *kaddr;
4204         char *src = (char *)srcv;
4205         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4206         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4207
4208         WARN_ON(start > eb->len);
4209         WARN_ON(start + len > eb->start + eb->len);
4210
4211         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4212
4213         while (len > 0) {
4214                 page = extent_buffer_page(eb, i);
4215                 WARN_ON(!PageUptodate(page));
4216
4217                 cur = min(len, PAGE_CACHE_SIZE - offset);
4218                 kaddr = page_address(page);
4219                 memcpy(kaddr + offset, src, cur);
4220
4221                 src += cur;
4222                 len -= cur;
4223                 offset = 0;
4224                 i++;
4225         }
4226 }
4227
4228 void memset_extent_buffer(struct extent_buffer *eb, char c,
4229                           unsigned long start, unsigned long len)
4230 {
4231         size_t cur;
4232         size_t offset;
4233         struct page *page;
4234         char *kaddr;
4235         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4236         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4237
4238         WARN_ON(start > eb->len);
4239         WARN_ON(start + len > eb->start + eb->len);
4240
4241         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4242
4243         while (len > 0) {
4244                 page = extent_buffer_page(eb, i);
4245                 WARN_ON(!PageUptodate(page));
4246
4247                 cur = min(len, PAGE_CACHE_SIZE - offset);
4248                 kaddr = page_address(page);
4249                 memset(kaddr + offset, c, cur);
4250
4251                 len -= cur;
4252                 offset = 0;
4253                 i++;
4254         }
4255 }
4256
4257 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4258                         unsigned long dst_offset, unsigned long src_offset,
4259                         unsigned long len)
4260 {
4261         u64 dst_len = dst->len;
4262         size_t cur;
4263         size_t offset;
4264         struct page *page;
4265         char *kaddr;
4266         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4267         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4268
4269         WARN_ON(src->len != dst_len);
4270
4271         offset = (start_offset + dst_offset) &
4272                 ((unsigned long)PAGE_CACHE_SIZE - 1);
4273
4274         while (len > 0) {
4275                 page = extent_buffer_page(dst, i);
4276                 WARN_ON(!PageUptodate(page));
4277
4278                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4279
4280                 kaddr = page_address(page);
4281                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
4282
4283                 src_offset += cur;
4284                 len -= cur;
4285                 offset = 0;
4286                 i++;
4287         }
4288 }
4289
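/*
 * memmove-style helper for one page-sized chunk of a buffer-internal
 * move.  If source and destination share a page, memmove() handles any
 * overlap; otherwise the bytes are copied one at a time, starting from
 * the end of the range.
 */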
4290 static void move_pages(struct page *dst_page, struct page *src_page,
4291                        unsigned long dst_off, unsigned long src_off,
4292                        unsigned long len)
4293 {
4294         char *dst_kaddr = page_address(dst_page);
4295         if (dst_page == src_page) {
4296                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4297         } else {
4298                 char *src_kaddr = page_address(src_page);
4299                 char *p = dst_kaddr + dst_off + len;
4300                 char *s = src_kaddr + src_off + len;
4301
4302                 while (len--)
4303                         *--p = *--s;
4304         }
4305 }
4306
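/*
 * Return true if the ranges [src, src + len) and [dst, dst + len)
 * overlap, i.e. the distance between the two offsets is smaller than
 * the length being moved.  For example, src 100 and dst 164 with
 * len 64 do not overlap (the distance equals the length).
 */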
4307 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4308 {
4309         unsigned long distance = (src > dst) ? src - dst : dst - src;
4310         return distance < len;
4311 }
4312
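/*
 * memcpy-style helper for one page-sized chunk.  Overlapping ranges on
 * the same page are a caller bug, hence the BUG_ON(); callers that may
 * overlap have to go through move_pages() instead.
 */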
4313 static void copy_pages(struct page *dst_page, struct page *src_page,
4314                        unsigned long dst_off, unsigned long src_off,
4315                        unsigned long len)
4316 {
4317         char *dst_kaddr = page_address(dst_page);
4318         char *src_kaddr;
4319
4320         if (dst_page != src_page) {
4321                 src_kaddr = page_address(src_page);
4322         } else {
4323                 src_kaddr = dst_kaddr;
4324                 BUG_ON(areas_overlap(src_off, dst_off, len));
4325         }
4326
4327         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4328 }
4329
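/*
 * Copy @len bytes within the extent buffer @dst from @src_offset to
 * @dst_offset.  Each pass copies as much as fits in both the source and
 * the destination page; chunks that would overlap inside a single page
 * trip the BUG_ON() in copy_pages(), so callers with overlapping ranges
 * must use memmove_extent_buffer() instead.
 */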
4330 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4331                            unsigned long src_offset, unsigned long len)
4332 {
4333         size_t cur;
4334         size_t dst_off_in_page;
4335         size_t src_off_in_page;
4336         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4337         unsigned long dst_i;
4338         unsigned long src_i;
4339
4340         if (src_offset + len > dst->len) {
4341                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4342                        "len %lu dst len %lu\n", src_offset, len, dst->len);
4343                 BUG_ON(1);
4344         }
4345         if (dst_offset + len > dst->len) {
4346                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4347                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
4348                 BUG_ON(1);
4349         }
4350
4351         while (len > 0) {
4352                 dst_off_in_page = (start_offset + dst_offset) &
4353                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4354                 src_off_in_page = (start_offset + src_offset) &
4355                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4356
4357                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4358                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4359
4360                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4361                                                src_off_in_page));
4362                 cur = min_t(unsigned long, cur,
4363                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4364
4365                 copy_pages(extent_buffer_page(dst, dst_i),
4366                            extent_buffer_page(dst, src_i),
4367                            dst_off_in_page, src_off_in_page, cur);
4368
4369                 src_offset += cur;
4370                 dst_offset += cur;
4371                 len -= cur;
4372         }
4373 }
4374
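/*
 * memmove() equivalent within a single extent buffer.  Non-overlapping
 * ranges are handed straight to memcpy_extent_buffer(); overlapping
 * ranges are walked from the end of the region, one page-sized chunk at
 * a time, via move_pages().
 */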
4375 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4376                            unsigned long src_offset, unsigned long len)
4377 {
4378         size_t cur;
4379         size_t dst_off_in_page;
4380         size_t src_off_in_page;
4381         unsigned long dst_end = dst_offset + len - 1;
4382         unsigned long src_end = src_offset + len - 1;
4383         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4384         unsigned long dst_i;
4385         unsigned long src_i;
4386
4387         if (src_offset + len > dst->len) {
4388                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4389                        "len %lu len %lu\n", src_offset, len, dst->len);
4390                 BUG_ON(1);
4391         }
4392         if (dst_offset + len > dst->len) {
4393                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4394                        "len %lu len %lu\n", dst_offset, len, dst->len);
4395                 BUG_ON(1);
4396         }
4397         if (!areas_overlap(src_offset, dst_offset, len)) {
4398                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4399                 return;
4400         }
4401         while (len > 0) {
4402                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4403                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4404
4405                 dst_off_in_page = (start_offset + dst_end) &
4406                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4407                 src_off_in_page = (start_offset + src_end) &
4408                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4409
4410                 cur = min_t(unsigned long, len, src_off_in_page + 1);
4411                 cur = min(cur, dst_off_in_page + 1);
4412                 move_pages(extent_buffer_page(dst, dst_i),
4413                            extent_buffer_page(dst, src_i),
4414                            dst_off_in_page - cur + 1,
4415                            src_off_in_page - cur + 1, cur);
4416
4417                 dst_end -= cur;
4418                 src_end -= cur;
4419                 len -= cur;
4420         }
4421 }
4422
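/*
 * RCU callback: once the grace period has passed and no lockless
 * lookups can still see the buffer, hand it to
 * btrfs_release_extent_buffer() for the final release.
 */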
4423 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4424 {
4425         struct extent_buffer *eb =
4426                         container_of(head, struct extent_buffer, rcu_head);
4427
4428         btrfs_release_extent_buffer(eb);
4429 }
4430
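/*
 * Try to detach and free the extent buffer that backs @page.  Returns 1
 * if the buffer could be released (or none was attached) and 0 if it is
 * still dirty or still referenced.  Typically called when the VM wants
 * to release the page, e.g. from the btree releasepage path.
 */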
4431 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4432 {
4433         u64 start = page_offset(page);
4434         struct extent_buffer *eb;
4435         int ret = 1;
4436
4437         spin_lock(&tree->buffer_lock);
4438         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4439         if (!eb) {
4440                 spin_unlock(&tree->buffer_lock);
4441                 return ret;
4442         }
4443
4444         if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4445                 ret = 0;
4446                 goto out;
4447         }
4448
4449         /*
4450          * Set @eb->refs to 0 only if it is currently 1, i.e. we hold the
4451          * last reference and may release the buffer; otherwise back off.
4452          */
4453         if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
4454                 ret = 0;
4455                 goto out;
4456         }
4457
4458         radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4459 out:
4460         spin_unlock(&tree->buffer_lock);
4461
4462         /* at this point we can safely release the extent buffer */
4463         if (atomic_read(&eb->refs) == 0)
4464                 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4465         return ret;
4466 }