Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
[pandora-kernel.git] / fs / btrfs / free-space-cache.c
1 /*
2  * Copyright (C) 2008 Red Hat.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/math64.h>
22 #include "ctree.h"
23 #include "free-space-cache.h"
24 #include "transaction.h"
25
26 #define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
27 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
28
29 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
30                                           u64 offset)
31 {
32         BUG_ON(offset < bitmap_start);
33         offset -= bitmap_start;
34         return (unsigned long)(div64_u64(offset, sectorsize));
35 }
36
37 static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
38 {
39         return (unsigned long)(div64_u64(bytes, sectorsize));
40 }
41
42 static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43                                    u64 offset)
44 {
45         u64 bitmap_start;
46         u64 bytes_per_bitmap;
47
48         bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
49         bitmap_start = offset - block_group->key.objectid;
50         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51         bitmap_start *= bytes_per_bitmap;
52         bitmap_start += block_group->key.objectid;
53
54         return bitmap_start;
55 }
56
57 static int tree_insert_offset(struct rb_root *root, u64 offset,
58                               struct rb_node *node, int bitmap)
59 {
60         struct rb_node **p = &root->rb_node;
61         struct rb_node *parent = NULL;
62         struct btrfs_free_space *info;
63
64         while (*p) {
65                 parent = *p;
66                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
67
68                 if (offset < info->offset) {
69                         p = &(*p)->rb_left;
70                 } else if (offset > info->offset) {
71                         p = &(*p)->rb_right;
72                 } else {
73                         /*
74                          * we could have a bitmap entry and an extent entry
75                          * share the same offset.  If this is the case, we want
76                          * the extent entry to always be found first if we do a
77                          * linear search through the tree, since we want to have
78                          * the quickest allocation time, and allocating from an
79                          * extent is faster than allocating from a bitmap.  So
80                          * if we're inserting a bitmap and we find an entry at
81                          * this offset, we want to go right, or after this entry
82                          * logically.  If we are inserting an extent and we've
83                          * found a bitmap, we want to go left, or before
84                          * logically.
85                          */
86                         if (bitmap) {
87                                 WARN_ON(info->bitmap);
88                                 p = &(*p)->rb_right;
89                         } else {
90                                 WARN_ON(!info->bitmap);
91                                 p = &(*p)->rb_left;
92                         }
93                 }
94         }
95
96         rb_link_node(node, parent, p);
97         rb_insert_color(node, root);
98
99         return 0;
100 }
101
102 /*
103  * searches the tree for the given offset.
104  *
105  * fuzzy - If this is set, then we are trying to make an allocation, and we just
106  * want a section that has at least bytes size and comes at or after the given
107  * offset.
108  */
109 static struct btrfs_free_space *
110 tree_search_offset(struct btrfs_block_group_cache *block_group,
111                    u64 offset, int bitmap_only, int fuzzy)
112 {
113         struct rb_node *n = block_group->free_space_offset.rb_node;
114         struct btrfs_free_space *entry, *prev = NULL;
115
116         /* find entry that is closest to the 'offset' */
117         while (1) {
118                 if (!n) {
119                         entry = NULL;
120                         break;
121                 }
122
123                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124                 prev = entry;
125
126                 if (offset < entry->offset)
127                         n = n->rb_left;
128                 else if (offset > entry->offset)
129                         n = n->rb_right;
130                 else
131                         break;
132         }
133
134         if (bitmap_only) {
135                 if (!entry)
136                         return NULL;
137                 if (entry->bitmap)
138                         return entry;
139
140                 /*
141                  * bitmap entry and extent entry may share same offset,
142                  * in that case, bitmap entry comes after extent entry.
143                  */
144                 n = rb_next(n);
145                 if (!n)
146                         return NULL;
147                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
148                 if (entry->offset != offset)
149                         return NULL;
150
151                 WARN_ON(!entry->bitmap);
152                 return entry;
153         } else if (entry) {
154                 if (entry->bitmap) {
155                         /*
156                          * if previous extent entry covers the offset,
157                          * we should return it instead of the bitmap entry
158                          */
159                         n = &entry->offset_index;
160                         while (1) {
161                                 n = rb_prev(n);
162                                 if (!n)
163                                         break;
164                                 prev = rb_entry(n, struct btrfs_free_space,
165                                                 offset_index);
166                                 if (!prev->bitmap) {
167                                         if (prev->offset + prev->bytes > offset)
168                                                 entry = prev;
169                                         break;
170                                 }
171                         }
172                 }
173                 return entry;
174         }
175
176         if (!prev)
177                 return NULL;
178
179         /* find last entry before the 'offset' */
180         entry = prev;
181         if (entry->offset > offset) {
182                 n = rb_prev(&entry->offset_index);
183                 if (n) {
184                         entry = rb_entry(n, struct btrfs_free_space,
185                                         offset_index);
186                         BUG_ON(entry->offset > offset);
187                 } else {
188                         if (fuzzy)
189                                 return entry;
190                         else
191                                 return NULL;
192                 }
193         }
194
195         if (entry->bitmap) {
196                 n = &entry->offset_index;
197                 while (1) {
198                         n = rb_prev(n);
199                         if (!n)
200                                 break;
201                         prev = rb_entry(n, struct btrfs_free_space,
202                                         offset_index);
203                         if (!prev->bitmap) {
204                                 if (prev->offset + prev->bytes > offset)
205                                         return prev;
206                                 break;
207                         }
208                 }
209                 if (entry->offset + BITS_PER_BITMAP *
210                     block_group->sectorsize > offset)
211                         return entry;
212         } else if (entry->offset + entry->bytes > offset)
213                 return entry;
214
215         if (!fuzzy)
216                 return NULL;
217
218         while (1) {
219                 if (entry->bitmap) {
220                         if (entry->offset + BITS_PER_BITMAP *
221                             block_group->sectorsize > offset)
222                                 break;
223                 } else {
224                         if (entry->offset + entry->bytes > offset)
225                                 break;
226                 }
227
228                 n = rb_next(&entry->offset_index);
229                 if (!n)
230                         return NULL;
231                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232         }
233         return entry;
234 }
235
236 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
237                               struct btrfs_free_space *info)
238 {
239         rb_erase(&info->offset_index, &block_group->free_space_offset);
240         block_group->free_extents--;
241         block_group->free_space -= info->bytes;
242 }
243
244 static int link_free_space(struct btrfs_block_group_cache *block_group,
245                            struct btrfs_free_space *info)
246 {
247         int ret = 0;
248
249         BUG_ON(!info->bitmap && !info->bytes);
250         ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
251                                  &info->offset_index, (info->bitmap != NULL));
252         if (ret)
253                 return ret;
254
255         block_group->free_space += info->bytes;
256         block_group->free_extents++;
257         return ret;
258 }
259
260 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261 {
262         u64 max_bytes;
263         u64 bitmap_bytes;
264         u64 extent_bytes;
265
266         /*
267          * The goal is to keep the total amount of memory used per 1gb of space
268          * at or below 32k, so we need to adjust how much memory we allow to be
269          * used by extent based free space tracking
270          */
271         max_bytes = MAX_CACHE_BYTES_PER_GIG *
272                 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
273
274         /*
275          * we want to account for 1 more bitmap than what we have so we can make
276          * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
277          * we add more bitmaps.
278          */
279         bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
280
281         if (bitmap_bytes >= max_bytes) {
282                 block_group->extents_thresh = 0;
283                 return;
284         }
285
286         /*
287          * we want the extent entry threshold to always be at most 1/2 the maxw
288          * bytes we can have, or whatever is less than that.
289          */
290         extent_bytes = max_bytes - bitmap_bytes;
291         extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
292
293         block_group->extents_thresh =
294                 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
295 }
296
297 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
298                               struct btrfs_free_space *info, u64 offset,
299                               u64 bytes)
300 {
301         unsigned long start, end;
302         unsigned long i;
303
304         start = offset_to_bit(info->offset, block_group->sectorsize, offset);
305         end = start + bytes_to_bits(bytes, block_group->sectorsize);
306         BUG_ON(end > BITS_PER_BITMAP);
307
308         for (i = start; i < end; i++)
309                 clear_bit(i, info->bitmap);
310
311         info->bytes -= bytes;
312         block_group->free_space -= bytes;
313 }
314
315 static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
316                             struct btrfs_free_space *info, u64 offset,
317                             u64 bytes)
318 {
319         unsigned long start, end;
320         unsigned long i;
321
322         start = offset_to_bit(info->offset, block_group->sectorsize, offset);
323         end = start + bytes_to_bits(bytes, block_group->sectorsize);
324         BUG_ON(end > BITS_PER_BITMAP);
325
326         for (i = start; i < end; i++)
327                 set_bit(i, info->bitmap);
328
329         info->bytes += bytes;
330         block_group->free_space += bytes;
331 }
332
333 static int search_bitmap(struct btrfs_block_group_cache *block_group,
334                          struct btrfs_free_space *bitmap_info, u64 *offset,
335                          u64 *bytes)
336 {
337         unsigned long found_bits = 0;
338         unsigned long bits, i;
339         unsigned long next_zero;
340
341         i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
342                           max_t(u64, *offset, bitmap_info->offset));
343         bits = bytes_to_bits(*bytes, block_group->sectorsize);
344
345         for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
346              i < BITS_PER_BITMAP;
347              i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
348                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
349                                                BITS_PER_BITMAP, i);
350                 if ((next_zero - i) >= bits) {
351                         found_bits = next_zero - i;
352                         break;
353                 }
354                 i = next_zero;
355         }
356
357         if (found_bits) {
358                 *offset = (u64)(i * block_group->sectorsize) +
359                         bitmap_info->offset;
360                 *bytes = (u64)(found_bits) * block_group->sectorsize;
361                 return 0;
362         }
363
364         return -1;
365 }
366
367 static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
368                                                 *block_group, u64 *offset,
369                                                 u64 *bytes, int debug)
370 {
371         struct btrfs_free_space *entry;
372         struct rb_node *node;
373         int ret;
374
375         if (!block_group->free_space_offset.rb_node)
376                 return NULL;
377
378         entry = tree_search_offset(block_group,
379                                    offset_to_bitmap(block_group, *offset),
380                                    0, 1);
381         if (!entry)
382                 return NULL;
383
384         for (node = &entry->offset_index; node; node = rb_next(node)) {
385                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
386                 if (entry->bytes < *bytes)
387                         continue;
388
389                 if (entry->bitmap) {
390                         ret = search_bitmap(block_group, entry, offset, bytes);
391                         if (!ret)
392                                 return entry;
393                         continue;
394                 }
395
396                 *offset = entry->offset;
397                 *bytes = entry->bytes;
398                 return entry;
399         }
400
401         return NULL;
402 }
403
404 static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
405                            struct btrfs_free_space *info, u64 offset)
406 {
407         u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
408         int max_bitmaps = (int)div64_u64(block_group->key.offset +
409                                          bytes_per_bg - 1, bytes_per_bg);
410         BUG_ON(block_group->total_bitmaps >= max_bitmaps);
411
412         info->offset = offset_to_bitmap(block_group, offset);
413         info->bytes = 0;
414         link_free_space(block_group, info);
415         block_group->total_bitmaps++;
416
417         recalculate_thresholds(block_group);
418 }
419
420 static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
421                               struct btrfs_free_space *bitmap_info,
422                               u64 *offset, u64 *bytes)
423 {
424         u64 end;
425         u64 search_start, search_bytes;
426         int ret;
427
428 again:
429         end = bitmap_info->offset +
430                 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
431
432         /*
433          * XXX - this can go away after a few releases.
434          *
435          * since the only user of btrfs_remove_free_space is the tree logging
436          * stuff, and the only way to test that is under crash conditions, we
437          * want to have this debug stuff here just in case somethings not
438          * working.  Search the bitmap for the space we are trying to use to
439          * make sure its actually there.  If its not there then we need to stop
440          * because something has gone wrong.
441          */
442         search_start = *offset;
443         search_bytes = *bytes;
444         ret = search_bitmap(block_group, bitmap_info, &search_start,
445                             &search_bytes);
446         BUG_ON(ret < 0 || search_start != *offset);
447
448         if (*offset > bitmap_info->offset && *offset + *bytes > end) {
449                 bitmap_clear_bits(block_group, bitmap_info, *offset,
450                                   end - *offset + 1);
451                 *bytes -= end - *offset + 1;
452                 *offset = end + 1;
453         } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
454                 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
455                 *bytes = 0;
456         }
457
458         if (*bytes) {
459                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
460                 if (!bitmap_info->bytes) {
461                         unlink_free_space(block_group, bitmap_info);
462                         kfree(bitmap_info->bitmap);
463                         kfree(bitmap_info);
464                         block_group->total_bitmaps--;
465                         recalculate_thresholds(block_group);
466                 }
467
468                 /*
469                  * no entry after this bitmap, but we still have bytes to
470                  * remove, so something has gone wrong.
471                  */
472                 if (!next)
473                         return -EINVAL;
474
475                 bitmap_info = rb_entry(next, struct btrfs_free_space,
476                                        offset_index);
477
478                 /*
479                  * if the next entry isn't a bitmap we need to return to let the
480                  * extent stuff do its work.
481                  */
482                 if (!bitmap_info->bitmap)
483                         return -EAGAIN;
484
485                 /*
486                  * Ok the next item is a bitmap, but it may not actually hold
487                  * the information for the rest of this free space stuff, so
488                  * look for it, and if we don't find it return so we can try
489                  * everything over again.
490                  */
491                 search_start = *offset;
492                 search_bytes = *bytes;
493                 ret = search_bitmap(block_group, bitmap_info, &search_start,
494                                     &search_bytes);
495                 if (ret < 0 || search_start != *offset)
496                         return -EAGAIN;
497
498                 goto again;
499         } else if (!bitmap_info->bytes) {
500                 unlink_free_space(block_group, bitmap_info);
501                 kfree(bitmap_info->bitmap);
502                 kfree(bitmap_info);
503                 block_group->total_bitmaps--;
504                 recalculate_thresholds(block_group);
505         }
506
507         return 0;
508 }
509
510 static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
511                               struct btrfs_free_space *info)
512 {
513         struct btrfs_free_space *bitmap_info;
514         int added = 0;
515         u64 bytes, offset, end;
516         int ret;
517
518         /*
519          * If we are below the extents threshold then we can add this as an
520          * extent, and don't have to deal with the bitmap
521          */
522         if (block_group->free_extents < block_group->extents_thresh &&
523             info->bytes > block_group->sectorsize * 4)
524                 return 0;
525
526         /*
527          * some block groups are so tiny they can't be enveloped by a bitmap, so
528          * don't even bother to create a bitmap for this
529          */
530         if (BITS_PER_BITMAP * block_group->sectorsize >
531             block_group->key.offset)
532                 return 0;
533
534         bytes = info->bytes;
535         offset = info->offset;
536
537 again:
538         bitmap_info = tree_search_offset(block_group,
539                                          offset_to_bitmap(block_group, offset),
540                                          1, 0);
541         if (!bitmap_info) {
542                 BUG_ON(added);
543                 goto new_bitmap;
544         }
545
546         end = bitmap_info->offset +
547                 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
548
549         if (offset >= bitmap_info->offset && offset + bytes > end) {
550                 bitmap_set_bits(block_group, bitmap_info, offset,
551                                 end - offset);
552                 bytes -= end - offset;
553                 offset = end;
554                 added = 0;
555         } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
556                 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
557                 bytes = 0;
558         } else {
559                 BUG();
560         }
561
562         if (!bytes) {
563                 ret = 1;
564                 goto out;
565         } else
566                 goto again;
567
568 new_bitmap:
569         if (info && info->bitmap) {
570                 add_new_bitmap(block_group, info, offset);
571                 added = 1;
572                 info = NULL;
573                 goto again;
574         } else {
575                 spin_unlock(&block_group->tree_lock);
576
577                 /* no pre-allocated info, allocate a new one */
578                 if (!info) {
579                         info = kzalloc(sizeof(struct btrfs_free_space),
580                                        GFP_NOFS);
581                         if (!info) {
582                                 spin_lock(&block_group->tree_lock);
583                                 ret = -ENOMEM;
584                                 goto out;
585                         }
586                 }
587
588                 /* allocate the bitmap */
589                 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
590                 spin_lock(&block_group->tree_lock);
591                 if (!info->bitmap) {
592                         ret = -ENOMEM;
593                         goto out;
594                 }
595                 goto again;
596         }
597
598 out:
599         if (info) {
600                 if (info->bitmap)
601                         kfree(info->bitmap);
602                 kfree(info);
603         }
604
605         return ret;
606 }
607
608 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
609                          u64 offset, u64 bytes)
610 {
611         struct btrfs_free_space *right_info = NULL;
612         struct btrfs_free_space *left_info = NULL;
613         struct btrfs_free_space *info = NULL;
614         int ret = 0;
615
616         info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
617         if (!info)
618                 return -ENOMEM;
619
620         info->offset = offset;
621         info->bytes = bytes;
622
623         spin_lock(&block_group->tree_lock);
624
625         /*
626          * first we want to see if there is free space adjacent to the range we
627          * are adding, if there is remove that struct and add a new one to
628          * cover the entire range
629          */
630         right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
631         if (right_info && rb_prev(&right_info->offset_index))
632                 left_info = rb_entry(rb_prev(&right_info->offset_index),
633                                      struct btrfs_free_space, offset_index);
634         else
635                 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
636
637         /*
638          * If there was no extent directly to the left or right of this new
639          * extent then we know we're going to have to allocate a new extent, so
640          * before we do that see if we need to drop this into a bitmap
641          */
642         if ((!left_info || left_info->bitmap) &&
643             (!right_info || right_info->bitmap)) {
644                 ret = insert_into_bitmap(block_group, info);
645
646                 if (ret < 0) {
647                         goto out;
648                 } else if (ret) {
649                         ret = 0;
650                         goto out;
651                 }
652         }
653
654         if (right_info && !right_info->bitmap) {
655                 unlink_free_space(block_group, right_info);
656                 info->bytes += right_info->bytes;
657                 kfree(right_info);
658         }
659
660         if (left_info && !left_info->bitmap &&
661             left_info->offset + left_info->bytes == offset) {
662                 unlink_free_space(block_group, left_info);
663                 info->offset = left_info->offset;
664                 info->bytes += left_info->bytes;
665                 kfree(left_info);
666         }
667
668         ret = link_free_space(block_group, info);
669         if (ret)
670                 kfree(info);
671 out:
672         spin_unlock(&block_group->tree_lock);
673
674         if (ret) {
675                 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
676                 BUG_ON(ret == -EEXIST);
677         }
678
679         return ret;
680 }
681
682 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
683                             u64 offset, u64 bytes)
684 {
685         struct btrfs_free_space *info;
686         struct btrfs_free_space *next_info = NULL;
687         int ret = 0;
688
689         spin_lock(&block_group->tree_lock);
690
691 again:
692         info = tree_search_offset(block_group, offset, 0, 0);
693         if (!info) {
694                 /*
695                  * oops didn't find an extent that matched the space we wanted
696                  * to remove, look for a bitmap instead
697                  */
698                 info = tree_search_offset(block_group,
699                                           offset_to_bitmap(block_group, offset),
700                                           1, 0);
701                 if (!info) {
702                         WARN_ON(1);
703                         goto out_lock;
704                 }
705         }
706
707         if (info->bytes < bytes && rb_next(&info->offset_index)) {
708                 u64 end;
709                 next_info = rb_entry(rb_next(&info->offset_index),
710                                              struct btrfs_free_space,
711                                              offset_index);
712
713                 if (next_info->bitmap)
714                         end = next_info->offset + BITS_PER_BITMAP *
715                                 block_group->sectorsize - 1;
716                 else
717                         end = next_info->offset + next_info->bytes;
718
719                 if (next_info->bytes < bytes ||
720                     next_info->offset > offset || offset > end) {
721                         printk(KERN_CRIT "Found free space at %llu, size %llu,"
722                               " trying to use %llu\n",
723                               (unsigned long long)info->offset,
724                               (unsigned long long)info->bytes,
725                               (unsigned long long)bytes);
726                         WARN_ON(1);
727                         ret = -EINVAL;
728                         goto out_lock;
729                 }
730
731                 info = next_info;
732         }
733
734         if (info->bytes == bytes) {
735                 unlink_free_space(block_group, info);
736                 if (info->bitmap) {
737                         kfree(info->bitmap);
738                         block_group->total_bitmaps--;
739                 }
740                 kfree(info);
741                 goto out_lock;
742         }
743
744         if (!info->bitmap && info->offset == offset) {
745                 unlink_free_space(block_group, info);
746                 info->offset += bytes;
747                 info->bytes -= bytes;
748                 link_free_space(block_group, info);
749                 goto out_lock;
750         }
751
752         if (!info->bitmap && info->offset <= offset &&
753             info->offset + info->bytes >= offset + bytes) {
754                 u64 old_start = info->offset;
755                 /*
756                  * we're freeing space in the middle of the info,
757                  * this can happen during tree log replay
758                  *
759                  * first unlink the old info and then
760                  * insert it again after the hole we're creating
761                  */
762                 unlink_free_space(block_group, info);
763                 if (offset + bytes < info->offset + info->bytes) {
764                         u64 old_end = info->offset + info->bytes;
765
766                         info->offset = offset + bytes;
767                         info->bytes = old_end - info->offset;
768                         ret = link_free_space(block_group, info);
769                         WARN_ON(ret);
770                         if (ret)
771                                 goto out_lock;
772                 } else {
773                         /* the hole we're creating ends at the end
774                          * of the info struct, just free the info
775                          */
776                         kfree(info);
777                 }
778                 spin_unlock(&block_group->tree_lock);
779
780                 /* step two, insert a new info struct to cover
781                  * anything before the hole
782                  */
783                 ret = btrfs_add_free_space(block_group, old_start,
784                                            offset - old_start);
785                 WARN_ON(ret);
786                 goto out;
787         }
788
789         ret = remove_from_bitmap(block_group, info, &offset, &bytes);
790         if (ret == -EAGAIN)
791                 goto again;
792         BUG_ON(ret);
793 out_lock:
794         spin_unlock(&block_group->tree_lock);
795 out:
796         return ret;
797 }
798
799 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
800                            u64 bytes)
801 {
802         struct btrfs_free_space *info;
803         struct rb_node *n;
804         int count = 0;
805
806         for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
807                 info = rb_entry(n, struct btrfs_free_space, offset_index);
808                 if (info->bytes >= bytes)
809                         count++;
810                 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
811                        (unsigned long long)info->offset,
812                        (unsigned long long)info->bytes,
813                        (info->bitmap) ? "yes" : "no");
814         }
815         printk(KERN_INFO "block group has cluster?: %s\n",
816                list_empty(&block_group->cluster_list) ? "no" : "yes");
817         printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
818                "\n", count);
819 }
820
821 u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
822 {
823         struct btrfs_free_space *info;
824         struct rb_node *n;
825         u64 ret = 0;
826
827         for (n = rb_first(&block_group->free_space_offset); n;
828              n = rb_next(n)) {
829                 info = rb_entry(n, struct btrfs_free_space, offset_index);
830                 ret += info->bytes;
831         }
832
833         return ret;
834 }
835
836 /*
837  * for a given cluster, put all of its extents back into the free
838  * space cache.  If the block group passed doesn't match the block group
839  * pointed to by the cluster, someone else raced in and freed the
840  * cluster already.  In that case, we just return without changing anything
841  */
842 static int
843 __btrfs_return_cluster_to_free_space(
844                              struct btrfs_block_group_cache *block_group,
845                              struct btrfs_free_cluster *cluster)
846 {
847         struct btrfs_free_space *entry;
848         struct rb_node *node;
849         bool bitmap;
850
851         spin_lock(&cluster->lock);
852         if (cluster->block_group != block_group)
853                 goto out;
854
855         bitmap = cluster->points_to_bitmap;
856         cluster->block_group = NULL;
857         cluster->window_start = 0;
858         list_del_init(&cluster->block_group_list);
859         cluster->points_to_bitmap = false;
860
861         if (bitmap)
862                 goto out;
863
864         node = rb_first(&cluster->root);
865         while (node) {
866                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
867                 node = rb_next(&entry->offset_index);
868                 rb_erase(&entry->offset_index, &cluster->root);
869                 BUG_ON(entry->bitmap);
870                 tree_insert_offset(&block_group->free_space_offset,
871                                    entry->offset, &entry->offset_index, 0);
872         }
873         cluster->root = RB_ROOT;
874
875 out:
876         spin_unlock(&cluster->lock);
877         btrfs_put_block_group(block_group);
878         return 0;
879 }
880
881 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
882 {
883         struct btrfs_free_space *info;
884         struct rb_node *node;
885         struct btrfs_free_cluster *cluster;
886         struct list_head *head;
887
888         spin_lock(&block_group->tree_lock);
889         while ((head = block_group->cluster_list.next) !=
890                &block_group->cluster_list) {
891                 cluster = list_entry(head, struct btrfs_free_cluster,
892                                      block_group_list);
893
894                 WARN_ON(cluster->block_group != block_group);
895                 __btrfs_return_cluster_to_free_space(block_group, cluster);
896                 if (need_resched()) {
897                         spin_unlock(&block_group->tree_lock);
898                         cond_resched();
899                         spin_lock(&block_group->tree_lock);
900                 }
901         }
902
903         while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
904                 info = rb_entry(node, struct btrfs_free_space, offset_index);
905                 unlink_free_space(block_group, info);
906                 if (info->bitmap)
907                         kfree(info->bitmap);
908                 kfree(info);
909                 if (need_resched()) {
910                         spin_unlock(&block_group->tree_lock);
911                         cond_resched();
912                         spin_lock(&block_group->tree_lock);
913                 }
914         }
915
916         spin_unlock(&block_group->tree_lock);
917 }
918
919 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
920                                u64 offset, u64 bytes, u64 empty_size)
921 {
922         struct btrfs_free_space *entry = NULL;
923         u64 bytes_search = bytes + empty_size;
924         u64 ret = 0;
925
926         spin_lock(&block_group->tree_lock);
927         entry = find_free_space(block_group, &offset, &bytes_search, 0);
928         if (!entry)
929                 goto out;
930
931         ret = offset;
932         if (entry->bitmap) {
933                 bitmap_clear_bits(block_group, entry, offset, bytes);
934                 if (!entry->bytes) {
935                         unlink_free_space(block_group, entry);
936                         kfree(entry->bitmap);
937                         kfree(entry);
938                         block_group->total_bitmaps--;
939                         recalculate_thresholds(block_group);
940                 }
941         } else {
942                 unlink_free_space(block_group, entry);
943                 entry->offset += bytes;
944                 entry->bytes -= bytes;
945                 if (!entry->bytes)
946                         kfree(entry);
947                 else
948                         link_free_space(block_group, entry);
949         }
950
951 out:
952         spin_unlock(&block_group->tree_lock);
953
954         return ret;
955 }
956
957 /*
958  * given a cluster, put all of its extents back into the free space
959  * cache.  If a block group is passed, this function will only free
960  * a cluster that belongs to the passed block group.
961  *
962  * Otherwise, it'll get a reference on the block group pointed to by the
963  * cluster and remove the cluster from it.
964  */
965 int btrfs_return_cluster_to_free_space(
966                                struct btrfs_block_group_cache *block_group,
967                                struct btrfs_free_cluster *cluster)
968 {
969         int ret;
970
971         /* first, get a safe pointer to the block group */
972         spin_lock(&cluster->lock);
973         if (!block_group) {
974                 block_group = cluster->block_group;
975                 if (!block_group) {
976                         spin_unlock(&cluster->lock);
977                         return 0;
978                 }
979         } else if (cluster->block_group != block_group) {
980                 /* someone else has already freed it don't redo their work */
981                 spin_unlock(&cluster->lock);
982                 return 0;
983         }
984         atomic_inc(&block_group->count);
985         spin_unlock(&cluster->lock);
986
987         /* now return any extents the cluster had on it */
988         spin_lock(&block_group->tree_lock);
989         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
990         spin_unlock(&block_group->tree_lock);
991
992         /* finally drop our ref */
993         btrfs_put_block_group(block_group);
994         return ret;
995 }
996
997 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
998                                    struct btrfs_free_cluster *cluster,
999                                    u64 bytes, u64 min_start)
1000 {
1001         struct btrfs_free_space *entry;
1002         int err;
1003         u64 search_start = cluster->window_start;
1004         u64 search_bytes = bytes;
1005         u64 ret = 0;
1006
1007         spin_lock(&block_group->tree_lock);
1008         spin_lock(&cluster->lock);
1009
1010         if (!cluster->points_to_bitmap)
1011                 goto out;
1012
1013         if (cluster->block_group != block_group)
1014                 goto out;
1015
1016         /*
1017          * search_start is the beginning of the bitmap, but at some point it may
1018          * be a good idea to point to the actual start of the free area in the
1019          * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1020          * to 1 to make sure we get the bitmap entry
1021          */
1022         entry = tree_search_offset(block_group,
1023                                    offset_to_bitmap(block_group, search_start),
1024                                    1, 0);
1025         if (!entry || !entry->bitmap)
1026                 goto out;
1027
1028         search_start = min_start;
1029         search_bytes = bytes;
1030
1031         err = search_bitmap(block_group, entry, &search_start,
1032                             &search_bytes);
1033         if (err)
1034                 goto out;
1035
1036         ret = search_start;
1037         bitmap_clear_bits(block_group, entry, ret, bytes);
1038 out:
1039         spin_unlock(&cluster->lock);
1040         spin_unlock(&block_group->tree_lock);
1041
1042         return ret;
1043 }
1044
1045 /*
1046  * given a cluster, try to allocate 'bytes' from it, returns 0
1047  * if it couldn't find anything suitably large, or a logical disk offset
1048  * if things worked out
1049  */
1050 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1051                              struct btrfs_free_cluster *cluster, u64 bytes,
1052                              u64 min_start)
1053 {
1054         struct btrfs_free_space *entry = NULL;
1055         struct rb_node *node;
1056         u64 ret = 0;
1057
1058         if (cluster->points_to_bitmap)
1059                 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1060                                                min_start);
1061
1062         spin_lock(&cluster->lock);
1063         if (bytes > cluster->max_size)
1064                 goto out;
1065
1066         if (cluster->block_group != block_group)
1067                 goto out;
1068
1069         node = rb_first(&cluster->root);
1070         if (!node)
1071                 goto out;
1072
1073         entry = rb_entry(node, struct btrfs_free_space, offset_index);
1074
1075         while(1) {
1076                 if (entry->bytes < bytes || entry->offset < min_start) {
1077                         struct rb_node *node;
1078
1079                         node = rb_next(&entry->offset_index);
1080                         if (!node)
1081                                 break;
1082                         entry = rb_entry(node, struct btrfs_free_space,
1083                                          offset_index);
1084                         continue;
1085                 }
1086                 ret = entry->offset;
1087
1088                 entry->offset += bytes;
1089                 entry->bytes -= bytes;
1090
1091                 if (entry->bytes == 0) {
1092                         rb_erase(&entry->offset_index, &cluster->root);
1093                         kfree(entry);
1094                 }
1095                 break;
1096         }
1097 out:
1098         spin_unlock(&cluster->lock);
1099
1100         return ret;
1101 }
1102
1103 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1104                                 struct btrfs_free_space *entry,
1105                                 struct btrfs_free_cluster *cluster,
1106                                 u64 offset, u64 bytes, u64 min_bytes)
1107 {
1108         unsigned long next_zero;
1109         unsigned long i;
1110         unsigned long search_bits;
1111         unsigned long total_bits;
1112         unsigned long found_bits;
1113         unsigned long start = 0;
1114         unsigned long total_found = 0;
1115         bool found = false;
1116
1117         i = offset_to_bit(entry->offset, block_group->sectorsize,
1118                           max_t(u64, offset, entry->offset));
1119         search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1120         total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1121
1122 again:
1123         found_bits = 0;
1124         for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1125              i < BITS_PER_BITMAP;
1126              i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1127                 next_zero = find_next_zero_bit(entry->bitmap,
1128                                                BITS_PER_BITMAP, i);
1129                 if (next_zero - i >= search_bits) {
1130                         found_bits = next_zero - i;
1131                         break;
1132                 }
1133                 i = next_zero;
1134         }
1135
1136         if (!found_bits)
1137                 return -1;
1138
1139         if (!found) {
1140                 start = i;
1141                 found = true;
1142         }
1143
1144         total_found += found_bits;
1145
1146         if (cluster->max_size < found_bits * block_group->sectorsize)
1147                 cluster->max_size = found_bits * block_group->sectorsize;
1148
1149         if (total_found < total_bits) {
1150                 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1151                 if (i - start > total_bits * 2) {
1152                         total_found = 0;
1153                         cluster->max_size = 0;
1154                         found = false;
1155                 }
1156                 goto again;
1157         }
1158
1159         cluster->window_start = start * block_group->sectorsize +
1160                 entry->offset;
1161         cluster->points_to_bitmap = true;
1162
1163         return 0;
1164 }
1165
1166 /*
1167  * here we try to find a cluster of blocks in a block group.  The goal
1168  * is to find at least bytes free and up to empty_size + bytes free.
1169  * We might not find them all in one contiguous area.
1170  *
1171  * returns zero and sets up cluster if things worked out, otherwise
1172  * it returns -enospc
1173  */
1174 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
1175                              struct btrfs_root *root,
1176                              struct btrfs_block_group_cache *block_group,
1177                              struct btrfs_free_cluster *cluster,
1178                              u64 offset, u64 bytes, u64 empty_size)
1179 {
1180         struct btrfs_free_space *entry = NULL;
1181         struct rb_node *node;
1182         struct btrfs_free_space *next;
1183         struct btrfs_free_space *last = NULL;
1184         u64 min_bytes;
1185         u64 window_start;
1186         u64 window_free;
1187         u64 max_extent = 0;
1188         bool found_bitmap = false;
1189         int ret;
1190
1191         /* for metadata, allow allocates with more holes */
1192         if (btrfs_test_opt(root, SSD_SPREAD)) {
1193                 min_bytes = bytes + empty_size;
1194         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1195                 /*
1196                  * we want to do larger allocations when we are
1197                  * flushing out the delayed refs, it helps prevent
1198                  * making more work as we go along.
1199                  */
1200                 if (trans->transaction->delayed_refs.flushing)
1201                         min_bytes = max(bytes, (bytes + empty_size) >> 1);
1202                 else
1203                         min_bytes = max(bytes, (bytes + empty_size) >> 4);
1204         } else
1205                 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1206
1207         spin_lock(&block_group->tree_lock);
1208         spin_lock(&cluster->lock);
1209
1210         /* someone already found a cluster, hooray */
1211         if (cluster->block_group) {
1212                 ret = 0;
1213                 goto out;
1214         }
1215 again:
1216         entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1217         if (!entry) {
1218                 ret = -ENOSPC;
1219                 goto out;
1220         }
1221
1222         /*
1223          * If found_bitmap is true, we exhausted our search for extent entries,
1224          * and we just want to search all of the bitmaps that we can find, and
1225          * ignore any extent entries we find.
1226          */
1227         while (entry->bitmap || found_bitmap ||
1228                (!entry->bitmap && entry->bytes < min_bytes)) {
1229                 struct rb_node *node = rb_next(&entry->offset_index);
1230
1231                 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1232                         ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1233                                                    offset, bytes + empty_size,
1234                                                    min_bytes);
1235                         if (!ret)
1236                                 goto got_it;
1237                 }
1238
1239                 if (!node) {
1240                         ret = -ENOSPC;
1241                         goto out;
1242                 }
1243                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1244         }
1245
1246         /*
1247          * We already searched all the extent entries from the passed in offset
1248          * to the end and didn't find enough space for the cluster, and we also
1249          * didn't find any bitmaps that met our criteria, just go ahead and exit
1250          */
1251         if (found_bitmap) {
1252                 ret = -ENOSPC;
1253                 goto out;
1254         }
1255
1256         cluster->points_to_bitmap = false;
1257         window_start = entry->offset;
1258         window_free = entry->bytes;
1259         last = entry;
1260         max_extent = entry->bytes;
1261
1262         while (1) {
1263                 /* out window is just right, lets fill it */
1264                 if (window_free >= bytes + empty_size)
1265                         break;
1266
1267                 node = rb_next(&last->offset_index);
1268                 if (!node) {
1269                         if (found_bitmap)
1270                                 goto again;
1271                         ret = -ENOSPC;
1272                         goto out;
1273                 }
1274                 next = rb_entry(node, struct btrfs_free_space, offset_index);
1275
1276                 /*
1277                  * we found a bitmap, so if this search doesn't result in a
1278                  * cluster, we know to go and search again for the bitmaps and
1279                  * start looking for space there
1280                  */
1281                 if (next->bitmap) {
1282                         if (!found_bitmap)
1283                                 offset = next->offset;
1284                         found_bitmap = true;
1285                         last = next;
1286                         continue;
1287                 }
1288
1289                 /*
1290                  * we haven't filled the empty size and the window is
1291                  * very large.  reset and try again
1292                  */
1293                 if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
1294                     next->offset - window_start > (bytes + empty_size) * 2) {
1295                         entry = next;
1296                         window_start = entry->offset;
1297                         window_free = entry->bytes;
1298                         last = entry;
1299                         max_extent = entry->bytes;
1300                 } else {
1301                         last = next;
1302                         window_free += next->bytes;
1303                         if (entry->bytes > max_extent)
1304                                 max_extent = entry->bytes;
1305                 }
1306         }
1307
1308         cluster->window_start = entry->offset;
1309
1310         /*
1311          * now we've found our entries, pull them out of the free space
1312          * cache and put them into the cluster rbtree
1313          *
1314          * The cluster includes an rbtree, but only uses the offset index
1315          * of each free space cache entry.
1316          */
1317         while (1) {
1318                 node = rb_next(&entry->offset_index);
1319                 if (entry->bitmap && node) {
1320                         entry = rb_entry(node, struct btrfs_free_space,
1321                                          offset_index);
1322                         continue;
1323                 } else if (entry->bitmap && !node) {
1324                         break;
1325                 }
1326
1327                 rb_erase(&entry->offset_index, &block_group->free_space_offset);
1328                 ret = tree_insert_offset(&cluster->root, entry->offset,
1329                                          &entry->offset_index, 0);
1330                 BUG_ON(ret);
1331
1332                 if (!node || entry == last)
1333                         break;
1334
1335                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1336         }
1337
1338         cluster->max_size = max_extent;
1339 got_it:
1340         ret = 0;
1341         atomic_inc(&block_group->count);
1342         list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
1343         cluster->block_group = block_group;
1344 out:
1345         spin_unlock(&cluster->lock);
1346         spin_unlock(&block_group->tree_lock);
1347
1348         return ret;
1349 }
1350
1351 /*
1352  * simple code to zero out a cluster
1353  */
1354 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355 {
1356         spin_lock_init(&cluster->lock);
1357         spin_lock_init(&cluster->refill_lock);
1358         cluster->root = RB_ROOT;
1359         cluster->max_size = 0;
1360         cluster->points_to_bitmap = false;
1361         INIT_LIST_HEAD(&cluster->block_group_list);
1362         cluster->block_group = NULL;
1363 }
1364