/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include "free-space-cache.h"
#include "transaction.h"
#include "extent_io.h"

#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
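
/*
 * Worked example (illustrative, assuming the common 4 KiB
 * PAGE_CACHE_SIZE): each bitmap occupies one page, so it holds
 * 4096 * 8 = 32768 bits.  With a 4 KiB sectorsize one bit covers one
 * sector, so a single bitmap describes 32768 * 4 KiB = 128 MiB of
 * space.  MAX_CACHE_BYTES_PER_GIG caps the in-memory tracking
 * structures at 32 KiB per GiB of block group space; see
 * recalculate_thresholds() below.
 */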
static void recalculate_thresholds(struct btrfs_block_group_cache
				   *block_group);
static int link_free_space(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info);
struct inode *lookup_free_space_inode(struct btrfs_root *root,
				      struct btrfs_block_group_cache
				      *block_group, struct btrfs_path *path)
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		btrfs_release_path(root, path);
		return ERR_PTR(-ENOENT);

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(root, path);

	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
		return ERR_PTR(-ENOENT);

	if (is_bad_inode(inode)) {
		return ERR_PTR(-ENOENT);

	inode->i_mapping->flags &= ~__GFP_FS;

	spin_lock(&block_group->lock);
	if (!root->fs_info->closing) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	spin_unlock(&block_group->lock);
int create_free_space_inode(struct btrfs_root *root,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_path *path)
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;

	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item,
				    block_group->key.objectid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
		btrfs_release_path(root, path);

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);
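
/*
 * Note on the on-disk layout (summarizing the code above): the cache
 * consists of a regular NODATASUM/NOCOMPRESS inode holding the cached
 * data, plus a btrfs_free_space_header item keyed by
 * (BTRFS_FREE_SPACE_OBJECTID, block_group->key.objectid) that records
 * the inode's key, the entry and bitmap counts, and the generation.
 */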
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct inode *inode)

	trans->block_rsv = root->orphan_block_rsv;
	ret = btrfs_block_rsv_check(trans, root,
				    root->orphan_block_rsv,
				    0, 5);

	oldsize = i_size_read(inode);
	btrfs_i_size_write(inode, 0);
	truncate_pagecache(inode, oldsize, 0);

	/*
	 * We don't need an orphan item because truncating the free space cache
	 * will never be split across transactions.
	 */
	ret = btrfs_truncate_inode_items(trans, root, inode,
					 0, BTRFS_EXTENT_DATA_KEY);

	return btrfs_update_inode(trans, root, inode);
static int readahead_cache(struct inode *inode)
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);

	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_group_cache *block_group)
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	u32 *checksums = NULL, *crc;
	char *disk_crcs = NULL;
	struct btrfs_key key;
	struct list_head bitmaps;
	u64 used = btrfs_block_group_used(&block_group->item);
	u32 cur_crc = ~(u32)0;
	unsigned long first_page_offset;

	/*
	 * If we're unmounting then just return, since this does a search on
	 * the normal root and not the commit root and we could deadlock.
	 */
	if (fs_info->closing)

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
	spin_unlock(&block_group->lock);
	INIT_LIST_HEAD(&bitmaps);

	path = btrfs_alloc_path();

	inode = lookup_free_space_inode(root, block_group, path);
		btrfs_free_path(path);

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode)) {
		btrfs_free_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		btrfs_free_path(path);

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_free_path(path);

	if (BTRFS_I(inode)->generation != generation) {
		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
		       " not match free space cache generation (%llu) for "
		       "block group %llu\n",
		       (unsigned long long)BTRFS_I(inode)->generation,
		       (unsigned long long)generation,
		       (unsigned long long)block_group->key.objectid);
	/* Set up everything for doing checksumming */
	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);

	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
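	/*
	 * Layout of the first page of the cache file (as the code above and
	 * the write path below both assume): an array of one u32 CRC per
	 * page, then a u64 generation, and then the free space entries start
	 * at first_page_offset.  Every later page is entirely entries (or
	 * one full bitmap).
	 */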
	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);

	ret = readahead_cache(inode);
		struct btrfs_free_space_entry *entry;
		struct btrfs_free_space *e;
		unsigned long offset = 0;
		unsigned long start_offset = 0;

		if (!num_entries && !num_bitmaps)

		if (index == 0)
			start_offset = first_page_offset;
		offset = start_offset;

		page = grab_cache_page(inode->i_mapping, index);

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			if (!PageUptodate(page)) {
				page_cache_release(page);
				printk(KERN_ERR "btrfs: error reading free "
				       "space cache: %llu\n",
				       (unsigned long long)
				       block_group->key.objectid);

			memcpy(disk_crcs, addr, first_page_offset);
			gen = addr + (sizeof(u32) * num_checksums);
			if (*gen != BTRFS_I(inode)->generation) {
				printk(KERN_ERR "btrfs: space cache generation"
				       " (%llu) does not match inode (%llu) "
				       "for block group %llu\n",
				       (unsigned long long)*gen,
				       (unsigned long long)
				       BTRFS_I(inode)->generation,
				       (unsigned long long)
				       block_group->key.objectid);
				page_cache_release(page);

			crc = (u32 *)disk_crcs;

		entry = addr + start_offset;

		/* First let's check our crc before we do anything fun */
		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
					  PAGE_CACHE_SIZE - start_offset);
		btrfs_csum_final(cur_crc, (char *)&cur_crc);
		if (cur_crc != *crc) {
			printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
			       "block group %llu\n", index,
			       (unsigned long long)block_group->key.objectid);
			page_cache_release(page);
			e = kmem_cache_zalloc(btrfs_free_space_cachep,
					      GFP_NOFS);
				page_cache_release(page);

			e->offset = le64_to_cpu(entry->offset);
			e->bytes = le64_to_cpu(entry->bytes);
				kmem_cache_free(btrfs_free_space_cachep, e);
				page_cache_release(page);

			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
				spin_lock(&block_group->tree_lock);
				ret = link_free_space(block_group, e);
				spin_unlock(&block_group->tree_lock);
					printk(KERN_ERR "Duplicate entries in "
					       "free space cache, dumping\n");
					page_cache_release(page);

				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
					kmem_cache_free(
						btrfs_free_space_cachep, e);
					page_cache_release(page);

				spin_lock(&block_group->tree_lock);
				ret = link_free_space(block_group, e);
				block_group->total_bitmaps++;
				recalculate_thresholds(block_group);
				spin_unlock(&block_group->tree_lock);
				list_add_tail(&e->list, &bitmaps);
					printk(KERN_ERR "Duplicate entries in "
					       "free space cache, dumping\n");
					page_cache_release(page);
			offset += sizeof(struct btrfs_free_space_entry);
			if (offset + sizeof(struct btrfs_free_space_entry) >=
			    PAGE_CACHE_SIZE)
				break;

		/*
		 * We read an entry out of this page, we need to move on to the
		 * next page.
		 */

		/*
		 * The bitmap pages come at the end, after the entry pages, in
		 * the order that the bitmap entries were added to the cache.
		 */
		e = list_entry(bitmaps.next, struct btrfs_free_space, list);
		list_del_init(&e->list);
		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);

		page_cache_release(page);
	spin_lock(&block_group->tree_lock);
	if (block_group->free_space != (block_group->key.offset - used -
					block_group->bytes_super)) {
		spin_unlock(&block_group->tree_lock);
		printk(KERN_ERR "block group %llu has the wrong amount of "
		       "free space\n", block_group->key.objectid);
	spin_unlock(&block_group->tree_lock);

	/* This cache is bogus, make sure it gets cleared */
	spin_lock(&block_group->lock);
	block_group->disk_cache_state = BTRFS_DC_CLEAR;
	spin_unlock(&block_group->lock);
	btrfs_remove_free_space_cache(block_group);
int btrfs_write_out_cache(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_block_group_cache *block_group,
			  struct btrfs_path *path)
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct rb_node *node;
	struct list_head *pos, *n;
	struct extent_state *cached_state = NULL;
	struct btrfs_free_cluster *cluster = NULL;
	struct extent_io_tree *unpin = NULL;
	struct list_head bitmap_list;
	struct btrfs_key key;
	u32 *crc, *checksums;
	unsigned long first_page_offset;
	int index = 0, num_pages = 0;
	bool next_page = false;
	bool out_of_space = false;

	root = root->fs_info->tree_root;

	INIT_LIST_HEAD(&bitmap_list);

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
	spin_unlock(&block_group->lock);
	inode = lookup_free_space_inode(root, block_group, path);

	if (!i_size_read(inode)) {

	node = rb_first(&block_group->free_space_offset);

	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
		PAGE_CACHE_SHIFT;
	filemap_write_and_wait(inode->i_mapping);
	btrfs_wait_ordered_range(inode, inode->i_size &
				 ~(root->sectorsize - 1), (u64)-1);

	/* We need a checksum per page. */
	crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);

	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);

	/*
	 * Since the first page has all of our checksums and our generation we
	 * need to calculate the offset into the page that we can start
	 * writing our entries.
	 */
	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
	/* Get the cluster for this block_group if it exists */
	if (!list_empty(&block_group->cluster_list))
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);

	/*
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one.
	 */
	unpin = root->fs_info->pinned_extents;

	/*
	 * Lock all pages first so we can lock the extent safely.
	 *
	 * NOTE: Because we hold the ref the entire time we're going to write
	 * to the page find_get_page should never fail, so we don't do a check
	 * after find_get_page at this point.  Just putting this here so
	 * people know and don't freak out.
	 */
	while (index < num_pages) {
		page = grab_cache_page(inode->i_mapping, index);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 0, &cached_state, GFP_NOFS);

	/*
	 * When searching for pinned extents, we need to start at our start
	 * offset.
	 */
	start = block_group->key.objectid;
	/* Write out the extent entries */
	do {
		struct btrfs_free_space_entry *entry;
		unsigned long offset = 0;
		unsigned long start_offset = 0;

		if (index == 0)
			start_offset = first_page_offset;
		offset = start_offset;

		if (index >= num_pages) {

		entry = addr + start_offset;

		memset(addr, 0, PAGE_CACHE_SIZE);
		while (node && !next_page) {
			struct btrfs_free_space *e;

			e = rb_entry(node, struct btrfs_free_space, offset_index);

			entry->offset = cpu_to_le64(e->offset);
			entry->bytes = cpu_to_le64(e->bytes);
			if (e->bitmap) {
				entry->type = BTRFS_FREE_SPACE_BITMAP;
				list_add_tail(&e->list, &bitmap_list);
			} else {
				entry->type = BTRFS_FREE_SPACE_EXTENT;
			node = rb_next(node);
			if (!node && cluster) {
				node = rb_first(&cluster->root);

			offset += sizeof(struct btrfs_free_space_entry);
			if (offset + sizeof(struct btrfs_free_space_entry) >=
			    PAGE_CACHE_SIZE)
				next_page = true;

		/*
		 * We want to add any pinned extents to our free space cache
		 * so we don't leak the space.
		 */
		while (!next_page && (start < block_group->key.objectid +
		       block_group->key.offset)) {
			ret = find_first_extent_bit(unpin, start, &start, &end,
						    EXTENT_DIRTY);

			/* This pinned extent is out of our range */
			if (start >= block_group->key.objectid +
			    block_group->key.offset)

			len = block_group->key.objectid +
			      block_group->key.offset - start;
			len = min(len, end + 1 - start);

			entry->offset = cpu_to_le64(start);
			entry->bytes = cpu_to_le64(len);
			entry->type = BTRFS_FREE_SPACE_EXTENT;

			offset += sizeof(struct btrfs_free_space_entry);
			if (offset + sizeof(struct btrfs_free_space_entry) >=
			    PAGE_CACHE_SIZE)
				next_page = true;

		*crc = btrfs_csum_data(root, addr + start_offset, *crc,
				       PAGE_CACHE_SIZE - start_offset);
		btrfs_csum_final(*crc, (char *)crc);

		bytes += PAGE_CACHE_SIZE;

	} while (node || next_page);
	/* Write out the bitmaps */
	list_for_each_safe(pos, n, &bitmap_list) {
		struct btrfs_free_space *entry =
			list_entry(pos, struct btrfs_free_space, list);

		if (index >= num_pages) {

		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);

		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
		btrfs_csum_final(*crc, (char *)crc);

		bytes += PAGE_CACHE_SIZE;

		list_del_init(&entry->list);

	btrfs_drop_pages(pages, num_pages);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state,
			     GFP_NOFS);
	/* Zero out the rest of the pages just to make sure */
	while (index < num_pages) {

		memset(addr, 0, PAGE_CACHE_SIZE);

		bytes += PAGE_CACHE_SIZE;

	/* Write the checksums and trans id to the first page */

	memcpy(addr, checksums, sizeof(u32) * num_pages);
	gen = addr + (sizeof(u32) * num_pages);
	*gen = trans->transid;
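	/*
	 * This generation is what load_free_space_cache() compares against
	 * BTRFS_I(inode)->generation (and the free space header's
	 * generation) when deciding whether the cache can be trusted on the
	 * next mount.
	 */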
	ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
				bytes, &cached_state);
	btrfs_drop_pages(pages, num_pages);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);

	BTRFS_I(inode)->generation = trans->transid;

	filemap_write_and_wait(inode->i_mapping);
	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;

	ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);

	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		BUG_ON(!path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != block_group->key.objectid) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
					 EXTENT_DIRTY | EXTENT_DELALLOC |
					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
					 GFP_NOFS);
			btrfs_release_path(root, path);

	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	invalidate_inode_pages2_range(inode->i_mapping, 0, index);
	spin_lock(&block_group->lock);
	block_group->disk_cache_state = BTRFS_DC_ERROR;
	spin_unlock(&block_group->lock);
	BTRFS_I(inode)->generation = 0;

	btrfs_update_inode(trans, root, inode);
static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
					  u64 offset)
	BUG_ON(offset < bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div64_u64(offset, sectorsize));

static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
	return (unsigned long)(div64_u64(bytes, sectorsize));

static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
				   u64 offset)
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
	bitmap_start = offset - block_group->key.objectid;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += block_group->key.objectid;
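
/*
 * Worked example for offset_to_bitmap() above (illustrative, with a
 * 4 KiB sectorsize so one bitmap covers 128 MiB): for a block group
 * starting at 1 GiB, an offset of 1 GiB + 200 MiB maps to the bitmap
 * starting at 1 GiB + 128 MiB.  In other words, offsets round down to
 * a bitmap boundary measured from the start of the block group.
 */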
static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before it,
			 * logically.
			 */

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we
 * just want a section that has at least bytes size and comes at or after the
 * given offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_block_group_cache *block_group,
		   u64 offset, int bitmap_only, int fuzzy)
	struct rb_node *n = block_group->free_space_offset.rb_node;
	struct btrfs_free_space *entry, *prev = NULL;

	/* find entry that is closest to the 'offset' */
		entry = rb_entry(n, struct btrfs_free_space, offset_index);

		if (offset < entry->offset)
		else if (offset > entry->offset)

		/*
		 * A bitmap entry and an extent entry may share the same
		 * offset; in that case, the bitmap entry comes after the
		 * extent entry.
		 */
			entry = rb_entry(n, struct btrfs_free_space, offset_index);
			if (entry->offset != offset)

		WARN_ON(!entry->bitmap);
		if (entry->bitmap) {
			/*
			 * if the previous extent entry covers the offset, we
			 * should return it instead of the bitmap entry
			 */
			n = &entry->offset_index;
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap) {
					if (prev->offset + prev->bytes > offset)

	/* find last entry before the 'offset' */
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
			entry = rb_entry(n, struct btrfs_free_space,
					 offset_index);
			BUG_ON(entry->offset > offset);

	if (entry->bitmap) {
		n = &entry->offset_index;
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap) {
				if (prev->offset + prev->bytes > offset)
		if (entry->offset + BITS_PER_BITMAP *
		    block_group->sectorsize > offset)
	} else if (entry->offset + entry->bytes > offset)

		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    block_group->sectorsize > offset)
			if (entry->offset + entry->bytes > offset)

		n = rb_next(&entry->offset_index);
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
static void
__unlink_free_space(struct btrfs_block_group_cache *block_group,
		    struct btrfs_free_space *info)
	rb_erase(&info->offset_index, &block_group->free_space_offset);
	block_group->free_extents--;

static void unlink_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info)
	__unlink_free_space(block_group, info);
	block_group->free_space -= info->bytes;

static int link_free_space(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info)
	BUG_ON(!info->bitmap && !info->bytes);
	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
				 &info->offset_index, (info->bitmap != NULL));

	block_group->free_space += info->bytes;
	block_group->free_extents++;
static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
	u64 size = block_group->key.offset;

	/*
	 * The goal is to keep the total amount of memory used per 1GB of
	 * space at or below 32KB, so we need to adjust how much memory we
	 * allow to be used by extent-based free space tracking.
	 */
	if (size < 1024 * 1024 * 1024)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG *
			div64_u64(size, 1024 * 1024 * 1024);

	/*
	 * we want to account for 1 more bitmap than what we have so we can
	 * make sure we don't go over our overall goal of
	 * MAX_CACHE_BYTES_PER_GIG as we add more bitmaps.
	 */
	bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;

	if (bitmap_bytes >= max_bytes) {
		block_group->extents_thresh = 0;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the
	 * max bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));

	block_group->extents_thresh =
		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
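
/*
 * Worked example (illustrative, assuming 4 KiB pages): a 1 GiB block
 * group gets max_bytes = 32 KiB.  With 3 bitmaps in use we budget for
 * 4, so bitmap_bytes = 4 * 4 KiB = 16 KiB, and extent_bytes =
 * min(32 KiB - 16 KiB, 32 KiB / 2) = 16 KiB, which is then divided by
 * sizeof(struct btrfs_free_space) to get the extent entry threshold.
 */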
static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
	unsigned long start, end;
	unsigned long i;

	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
	end = start + bytes_to_bits(bytes, block_group->sectorsize);
	BUG_ON(end > BITS_PER_BITMAP);

	for (i = start; i < end; i++)
		clear_bit(i, info->bitmap);

	info->bytes -= bytes;
	block_group->free_space -= bytes;

static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
	unsigned long start, end;
	unsigned long i;

	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
	end = start + bytes_to_bits(bytes, block_group->sectorsize);
	BUG_ON(end > BITS_PER_BITMAP);

	for (i = start; i < end; i++)
		set_bit(i, info->bitmap);

	info->bytes += bytes;
	block_group->free_space += bytes;
static int search_bitmap(struct btrfs_block_group_cache *block_group,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes)
	unsigned long found_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;

	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, block_group->sectorsize);

	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		if ((next_zero - i) >= bits) {
			found_bits = next_zero - i;
			break;

	if (found_bits) {
		*offset = (u64)(i * block_group->sectorsize) +
			bitmap_info->offset;
		*bytes = (u64)(found_bits) * block_group->sectorsize;
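
/*
 * Note on the contract of search_bitmap() above: on success, *offset and
 * *bytes are updated to describe the first run of set bits at least as
 * long as the requested size; if no such run exists, the function
 * returns failure and the caller must look elsewhere.
 */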
static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
						*block_group, u64 *offset,
						u64 *bytes, int debug)
	struct btrfs_free_space *entry;
	struct rb_node *node;

	if (!block_group->free_space_offset.rb_node)

	entry = tree_search_offset(block_group,
				   offset_to_bitmap(block_group, *offset),
				   0, 1);

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes)

		if (entry->bitmap) {
			ret = search_bitmap(block_group, entry, offset, bytes);

		*offset = entry->offset;
		*bytes = entry->bytes;
static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info, u64 offset)
	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
	int max_bitmaps = (int)div64_u64(block_group->key.offset +
					 bytes_per_bg - 1, bytes_per_bg);
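	/*
	 * For example (assuming a 4 KiB sectorsize, so 128 MiB per bitmap):
	 * a 1 GiB block group can need at most ceil(1 GiB / 128 MiB) = 8
	 * bitmaps; the rounding up covers block groups whose size is not a
	 * multiple of the per-bitmap coverage.
	 */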
	BUG_ON(block_group->total_bitmaps >= max_bitmaps);

	info->offset = offset_to_bitmap(block_group, offset);
	info->bytes = 0;
	link_free_space(block_group, info);
	block_group->total_bitmaps++;

	recalculate_thresholds(block_group);
static void free_bitmap(struct btrfs_block_group_cache *block_group,
			struct btrfs_free_space *bitmap_info)
	unlink_free_space(block_group, bitmap_info);
	kfree(bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	block_group->total_bitmaps--;
	recalculate_thresholds(block_group);
static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
				       struct btrfs_free_space *bitmap_info,
				       u64 *offset, u64 *bytes)
	u64 search_start, search_bytes;

	end = bitmap_info->offset +
	      (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;

	/*
	 * XXX - this can go away after a few releases.
	 *
	 * Since the only user of btrfs_remove_free_space is the tree logging
	 * stuff, and the only way to test that is under crash conditions, we
	 * want to have this debug stuff here just in case something's not
	 * working.  Search the bitmap for the space we are trying to use to
	 * make sure it's actually there.  If it's not there then we need to
	 * stop because something has gone wrong.
	 */
	search_start = *offset;
	search_bytes = *bytes;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(block_group, bitmap_info, &search_start,
			    &search_bytes);
	BUG_ON(ret < 0 || search_start != *offset);

	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
		bitmap_clear_bits(block_group, bitmap_info, *offset,
				  end - *offset + 1);
		*bytes -= end - *offset + 1;
	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
		bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);

		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(block_group, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let
		 * the extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = *bytes;
		ret = search_bitmap(block_group, bitmap_info, &search_start,
				    &search_bytes);
		if (ret < 0 || search_start != *offset)

	} else if (!bitmap_info->bytes)
		free_bitmap(block_group, bitmap_info);
static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info)
	struct btrfs_free_space *bitmap_info;
	u64 bytes, offset, end;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap.
	 */
	if (block_group->free_extents < block_group->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want
		 * to use up all of our free slots in the cache with them, we
		 * want to reserve them for larger extents, however if we
		 * have plenty of cache left then go ahead and add them, no
		 * sense in adding the overhead of a bitmap if we don't have
		 * to.
		 */
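		/*
		 * Concretely (with a 4 KiB sectorsize, for illustration):
		 * extents of at most 4 sectors = 16 KiB are only linked as
		 * plain extent entries while fewer than half of the extent
		 * slots are in use; otherwise they fall through to the
		 * bitmap path below.
		 */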
		if (info->bytes <= block_group->sectorsize * 4) {
			if (block_group->free_extents * 2 <=
			    block_group->extents_thresh)

	/*
	 * some block groups are so tiny they can't be enveloped by a bitmap,
	 * so don't even bother to create a bitmap for this block group.
	 */
	if (BITS_PER_BITMAP * block_group->sectorsize >
	    block_group->key.offset)
	bytes = info->bytes;
	offset = info->offset;

	bitmap_info = tree_search_offset(block_group,
					 offset_to_bitmap(block_group, offset),
					 1, 0);

	end = bitmap_info->offset +
	      (u64)(BITS_PER_BITMAP * block_group->sectorsize);

	if (offset >= bitmap_info->offset && offset + bytes > end) {
		bitmap_set_bits(block_group, bitmap_info, offset,
				end - offset);
		bytes -= end - offset;
	} else if (offset >= bitmap_info->offset && offset + bytes <= end) {
		bitmap_set_bits(block_group, bitmap_info, offset, bytes);

	if (info && info->bitmap) {
		add_new_bitmap(block_group, info, offset);

	spin_unlock(&block_group->tree_lock);

	/* no pre-allocated info, allocate a new one */
		info = kmem_cache_zalloc(btrfs_free_space_cachep,
					 GFP_NOFS);
		spin_lock(&block_group->tree_lock);

	/* allocate the bitmap */
	info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
	spin_lock(&block_group->tree_lock);
	if (!info->bitmap) {

		kfree(info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, info);
static bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
				 struct btrfs_free_space *info,
				 bool update_stat)
	struct btrfs_free_space *left_info;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;

	/*
	 * first we want to see if there is free space adjacent to the range
	 * we are adding; if there is, remove that struct and add a new one
	 * to cover the entire range.
	 */
	right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
	else
		left_info = tree_search_offset(block_group, offset - 1, 0, 0);

	if (right_info && !right_info->bitmap) {
		if (update_stat)
			unlink_free_space(block_group, right_info);
		else
			__unlink_free_space(block_group, right_info);
		info->bytes += right_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, right_info);

	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset) {
		if (update_stat)
			unlink_free_space(block_group, left_info);
		else
			__unlink_free_space(block_group, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, left_info);
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
			 u64 offset, u64 bytes)
	struct btrfs_free_space *info;

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);

	info->offset = offset;
	info->bytes = bytes;

	spin_lock(&block_group->tree_lock);

	if (try_merge_free_space(block_group, info, true))

	/*
	 * If there was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent,
	 * so before we do that see if we need to drop this into a bitmap.
	 */
	ret = insert_into_bitmap(block_group, info);

	ret = link_free_space(block_group, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);

	spin_unlock(&block_group->tree_lock);

	if (ret) {
		printk(KERN_CRIT "btrfs: unable to add free space: %d\n", ret);
		BUG_ON(ret == -EEXIST);
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			    u64 offset, u64 bytes)
	struct btrfs_free_space *info;
	struct btrfs_free_space *next_info = NULL;

	spin_lock(&block_group->tree_lock);

	info = tree_search_offset(block_group, offset, 0, 0);
		/*
		 * oops, didn't find an extent that matched the space we
		 * wanted to remove; look for a bitmap instead.
		 */
		info = tree_search_offset(block_group,
					  offset_to_bitmap(block_group, offset),
					  1, 0);

	if (info->bytes < bytes && rb_next(&info->offset_index)) {
		u64 end;
		next_info = rb_entry(rb_next(&info->offset_index),
				     struct btrfs_free_space,
				     offset_index);

		if (next_info->bitmap)
			end = next_info->offset + BITS_PER_BITMAP *
			      block_group->sectorsize - 1;
		else
			end = next_info->offset + next_info->bytes;

		if (next_info->bytes < bytes ||
		    next_info->offset > offset || offset > end) {
			printk(KERN_CRIT "Found free space at %llu, size %llu,"
			       " trying to use %llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)bytes);

	if (info->bytes == bytes) {
		unlink_free_space(block_group, info);
		if (info->bitmap) {
			kfree(info->bitmap);
			block_group->total_bitmaps--;
		}
		kmem_cache_free(btrfs_free_space_cachep, info);

	if (!info->bitmap && info->offset == offset) {
		unlink_free_space(block_group, info);
		info->offset += bytes;
		info->bytes -= bytes;
		link_free_space(block_group, info);

	if (!info->bitmap && info->offset <= offset &&
	    info->offset + info->bytes >= offset + bytes) {
		u64 old_start = info->offset;
		/*
		 * we're freeing space in the middle of the info,
		 * this can happen during tree log replay
		 *
		 * first unlink the old info and then
		 * insert it again after the hole we're creating
		 */
		unlink_free_space(block_group, info);
		if (offset + bytes < info->offset + info->bytes) {
			u64 old_end = info->offset + info->bytes;

			info->offset = offset + bytes;
			info->bytes = old_end - info->offset;
			ret = link_free_space(block_group, info);
		} else {
			/* the hole we're creating ends at the end
			 * of the info struct, just free the info
			 */
			kmem_cache_free(btrfs_free_space_cachep, info);

		spin_unlock(&block_group->tree_lock);

		/* step two, insert a new info struct to cover
		 * anything before the hole
		 */
		ret = btrfs_add_free_space(block_group, old_start,
					   offset - old_start);

	ret = remove_from_bitmap(block_group, info, &offset, &bytes);

	spin_unlock(&block_group->tree_lock);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
			   u64 bytes)
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes)
			count++;
		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
		       (unsigned long long)info->offset,
		       (unsigned long long)info->bytes,
		       (info->bitmap) ? "yes" : "no");

	printk(KERN_INFO "block group has cluster?: %s\n",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
	       "\n", count);
u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
	struct btrfs_free_space *info;
	struct rb_node *n;

	for (n = rb_first(&block_group->free_space_offset); n;
	     n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything.
 */
static int
__btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster)
	struct btrfs_free_space *entry;
	struct rb_node *node;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group)

	cluster->block_group = NULL;
	cluster->window_start = 0;
	list_del_init(&cluster->block_group_list);

	node = rb_first(&cluster->root);

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);

		bitmap = (entry->bitmap != NULL);
		if (!bitmap)
			try_merge_free_space(block_group, entry, false);
		tree_insert_offset(&block_group->free_space_offset,
				   entry->offset, &entry->offset_index, bitmap);
	cluster->root = RB_ROOT;

	spin_unlock(&cluster->lock);
	btrfs_put_block_group(block_group);
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
	struct btrfs_free_space *info;
	struct rb_node *node;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&block_group->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);

	while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(block_group, info);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(block_group, info);

		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);

	spin_unlock(&block_group->tree_lock);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size)
	struct btrfs_free_space *entry = NULL;
	u64 bytes_search = bytes + empty_size;

	spin_lock(&block_group->tree_lock);
	entry = find_free_space(block_group, &offset, &bytes_search, 0);

	if (entry->bitmap) {
		bitmap_clear_bits(block_group, entry, offset, bytes);
		if (!entry->bytes)
			free_bitmap(block_group, entry);
	} else {
		unlink_free_space(block_group, entry);
		entry->offset += bytes;
		entry->bytes -= bytes;
		if (!entry->bytes)
			kmem_cache_free(btrfs_free_space_cachep, entry);
		else
			link_free_space(block_group, entry);

	spin_unlock(&block_group->tree_lock);
/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group_cache *block_group,
			       struct btrfs_free_cluster *cluster)
	int ret;

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
			return 0;
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it, don't redo their work */
		spin_unlock(&cluster->lock);
		return 0;
	}
	atomic_inc(&block_group->count);
	spin_unlock(&cluster->lock);

	/* now return any extents the cluster had on it */
	spin_lock(&block_group->tree_lock);
	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&block_group->tree_lock);

	/* finally drop our ref */
	btrfs_put_block_group(block_group);
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
				   struct btrfs_free_cluster *cluster,
				   struct btrfs_free_space *entry,
				   u64 bytes, u64 min_start)
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;

	search_start = min_start;
	search_bytes = bytes;

	err = search_bitmap(block_group, entry, &search_start,
			    &search_bytes);

	ret = search_start;
	bitmap_clear_bits(block_group, entry, ret, bytes);
/*
 * given a cluster, try to allocate 'bytes' from it; returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out.
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster, u64 bytes,
			     u64 min_start)
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;

	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)

	if (cluster->block_group != block_group)

	node = rb_first(&cluster->root);

	entry = rb_entry(node, struct btrfs_free_space, offset_index);
	while (1) {
		if (entry->bytes < bytes ||
		    (!entry->bitmap && entry->offset < min_start)) {
			struct rb_node *node;

			node = rb_next(&entry->offset_index);
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);

		if (entry->bitmap) {
			ret = btrfs_alloc_from_bitmap(block_group,
						      cluster, entry, bytes,
						      min_start);
				struct rb_node *node;
				node = rb_next(&entry->offset_index);
				entry = rb_entry(node, struct btrfs_free_space,
						 offset_index);

			ret = entry->offset;

			entry->offset += bytes;
			entry->bytes -= bytes;

		if (entry->bytes == 0)
			rb_erase(&entry->offset_index, &cluster->root);

	spin_unlock(&cluster->lock);

	spin_lock(&block_group->tree_lock);

	block_group->free_space -= bytes;
	if (entry->bytes == 0) {
		block_group->free_extents--;
		if (entry->bitmap) {
			kfree(entry->bitmap);
			block_group->total_bitmaps--;
			recalculate_thresholds(block_group);
		kmem_cache_free(btrfs_free_space_cachep, entry);

	spin_unlock(&block_group->tree_lock);
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes, u64 min_bytes)
	unsigned long next_zero;
	unsigned long i;
	unsigned long search_bits;
	unsigned long total_bits;
	unsigned long found_bits;
	unsigned long start = 0;
	unsigned long total_found = 0;

	i = offset_to_bit(entry->offset, block_group->sectorsize,
			  max_t(u64, offset, entry->offset));
	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);

	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
		if (next_zero - i >= search_bits) {
			found_bits = next_zero - i;
			break;

	total_found += found_bits;

	if (cluster->max_size < found_bits * block_group->sectorsize)
		cluster->max_size = found_bits * block_group->sectorsize;

	if (total_found < total_bits) {
		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
		if (i - start > total_bits * 2) {
			total_found = 0;
			cluster->max_size = 0;

	cluster->window_start = start * block_group->sectorsize +
		entry->offset;
	rb_erase(&entry->offset_index, &block_group->free_space_offset);
	ret = tree_insert_offset(&cluster->root, entry->offset,
				 &entry->offset_index, 1);
/*
 * This searches the block group for just extents to fill the cluster with.
 */
static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
				   struct btrfs_free_cluster *cluster,
				   u64 offset, u64 bytes, u64 min_bytes)
	struct btrfs_free_space *first = NULL;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_free_space *prev = NULL;
	struct btrfs_free_space *last;
	struct rb_node *node;
	u64 max_gap = 128 * 1024;

	entry = tree_search_offset(block_group, offset, 0, 1);

	/*
	 * We don't want bitmaps, so just move along until we find a normal
	 * extent entry.
	 */
	while (entry->bitmap) {
		node = rb_next(&entry->offset_index);
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

	window_start = entry->offset;
	window_free = entry->bytes;
	max_extent = entry->bytes;

	while (window_free <= min_bytes) {
		node = rb_next(&entry->offset_index);
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

		/*
		 * we haven't filled the empty size and the window is
		 * very large.  Reset and try again.
		 */
		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
		    entry->offset - window_start > (min_bytes * 2)) {
			window_start = entry->offset;
			window_free = entry->bytes;
			max_extent = entry->bytes;

		window_free += entry->bytes;
		if (entry->bytes > max_extent)
			max_extent = entry->bytes;

	cluster->window_start = first->offset;

	node = &first->offset_index;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 */
	do {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);

		rb_erase(&entry->offset_index, &block_group->free_space_offset);
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
	} while (node && entry != last);

	cluster->max_size = max_extent;
/*
 * This specifically looks for bitmaps that may work in the cluster; we
 * assume that we have already failed to find extents that will work.
 */
static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes, u64 min_bytes)
	struct btrfs_free_space *entry;
	struct rb_node *node;

	if (block_group->total_bitmaps == 0)

	entry = tree_search_offset(block_group,
				   offset_to_bitmap(block_group, offset),
				   0, 1);

	node = &entry->offset_index;
	do {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		if (entry->bytes < min_bytes)
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
					   bytes, min_bytes);
	} while (ret && node);
/*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -ENOSPC
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)

	/* for metadata, allow allocations with more holes */
	if (btrfs_test_opt(root, SSD_SPREAD)) {
		min_bytes = bytes + empty_size;
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * we want to do larger allocations when we are
		 * flushing out the delayed refs, it helps prevent
		 * making more work as we go along.
		 */
		if (trans->transaction->delayed_refs.flushing)
			min_bytes = max(bytes, (bytes + empty_size) >> 1);
		else
			min_bytes = max(bytes, (bytes + empty_size) >> 4);
	} else
		min_bytes = max(bytes, (bytes + empty_size) >> 2);
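
	/*
	 * Worked example (illustrative numbers): with bytes = 256 KiB and
	 * empty_size = 768 KiB, SSD_SPREAD demands the full 1 MiB; metadata
	 * while flushing delayed refs demands max(256 KiB, 512 KiB) =
	 * 512 KiB; metadata otherwise max(256 KiB, 64 KiB) = 256 KiB; and
	 * data max(256 KiB, 256 KiB) = 256 KiB.
	 */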
	spin_lock(&block_group->tree_lock);

	/*
	 * If we know we don't have enough space to make a cluster don't even
	 * bother doing all the work to try and find one.
	 */
	if (block_group->free_space < min_bytes) {
		spin_unlock(&block_group->tree_lock);

	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {

	ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
				      min_bytes);
	if (ret)
		ret = setup_cluster_bitmap(block_group, cluster, offset,
					   bytes, min_bytes);

	if (!ret) {
		atomic_inc(&block_group->count);
		list_add_tail(&cluster->block_group_list,
			      &block_group->cluster_list);
		cluster->block_group = block_group;

	spin_unlock(&cluster->lock);
	spin_unlock(&block_group->tree_lock);
/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root = RB_ROOT;
	cluster->max_size = 0;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
	struct btrfs_free_space *entry = NULL;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 bytes = 0;
	u64 actually_trimmed;
	int ret = 0;

	while (start < end) {
		spin_lock(&block_group->tree_lock);

		if (block_group->free_space < minlen) {
			spin_unlock(&block_group->tree_lock);

		entry = tree_search_offset(block_group, start, 0, 1);
			entry = tree_search_offset(block_group,
						   offset_to_bitmap(block_group,
								    start),
						   1, 1);

		if (!entry || entry->offset >= end) {
			spin_unlock(&block_group->tree_lock);

		if (entry->bitmap) {
			ret = search_bitmap(block_group, entry, &start, &bytes);
				spin_unlock(&block_group->tree_lock);
				bytes = min(bytes, end - start);
				bitmap_clear_bits(block_group, entry,
						  start, bytes);
				if (entry->bytes == 0)
					free_bitmap(block_group, entry);
				start = entry->offset + BITS_PER_BITMAP *
					block_group->sectorsize;
				spin_unlock(&block_group->tree_lock);

			start = entry->offset;
			bytes = min(entry->bytes, end - start);
			unlink_free_space(block_group, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);

		spin_unlock(&block_group->tree_lock);

		if (bytes >= minlen) {
			int update_ret;
			update_ret = btrfs_update_reserved_bytes(block_group,
								 bytes, 1, 1);

			ret = btrfs_error_discard_extent(fs_info->extent_root,
							 start, bytes,
							 &actually_trimmed);

			btrfs_add_free_space(block_group,
					     start, bytes);
			if (!update_ret)
				btrfs_update_reserved_bytes(block_group,
							    bytes, 0, 1);

			*trimmed += actually_trimmed;

		if (fatal_signal_pending(current)) {