X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=fs%2Fbtrfs%2Fdisk-io.c;h=ffc363d2fb24ada506b1baa8cd15530e8d058ec7;hb=051e1b9f748ae673b7325d3fc049bb838606cffa;hp=3e16cca72b4962d04d82308b1c2cd28caa65c13d;hpb=324ae4df00fdc1a6a179bf584d8addf027bb75fb;p=pandora-kernel.git diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e16cca72b49..ffc363d2fb24 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -18,17 +18,21 @@ #include #include -#include #include #include #include #include #include // for block_sync_page +#include +#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" +#include "volumes.h" #include "print-tree.h" +#include "async-thread.h" +#include "locking.h" #if 0 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -43,67 +47,98 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) } #endif -static struct extent_map_ops btree_extent_map_ops; - -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) -{ - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree, - bytenr, blocksize, GFP_NOFS); - return eb; -} - -struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) -{ - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); + +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree, - bytenr, blocksize, NULL, GFP_NOFS); - return eb; -} +struct async_submit_bio { + struct inode *inode; + struct bio 
*bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_hook; + int rw; + int mirror_num; + struct btrfs_work work; +}; struct extent_map *btree_get_extent(struct inode *inode, struct page *page, - size_t page_offset, u64 start, u64 end, + size_t page_offset, u64 start, u64 len, int create) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret; -again: - em = lookup_extent_mapping(em_tree, start, end); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); goto out; } + spin_unlock(&em_tree->lock); + em = alloc_extent_map(GFP_NOFS); if (!em) { em = ERR_PTR(-ENOMEM); goto out; } em->start = 0; - em->end = (i_size_read(inode) & ~((u64)PAGE_CACHE_SIZE -1)) - 1; + em->len = (u64)-1; em->block_start = 0; - em->block_end = em->end; - em->bdev = inode->i_sb->s_bdev; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + printk("failed to insert %Lu %Lu -> %Lu into tree\n", + em->start, em->len, em->block_start); free_extent_map(em); - em = NULL; - goto again; + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + printk("after failing, found %Lu %Lu %Lu\n", + em->start, em->len, em->block_start); + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + if (em) { + printk("double failure lookup gives us " + "%Lu %Lu -> %Lu\n", em->start, + em->len, em->block_start); + free_extent_map(em); + } + ret = -EIO; + } } else if (ret) { - em = ERR_PTR(ret); + free_extent_map(em); + em = NULL; } + spin_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); out: return em; } u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) { - return crc32c(seed, 
data, len); + return btrfs_crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) @@ -145,10 +180,26 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, btrfs_csum_final(crc, result); if (verify) { + int from_this_trans = 0; + + if (root->fs_info->running_transaction && + btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) + from_this_trans = 1; + + /* FIXME, this is not good */ if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) { - printk("btrfs: %s checksum verify failed on %llu\n", + u32 val; + u32 found = 0; + memcpy(&found, result, BTRFS_CRC32_SIZE); + + read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE); + printk("btrfs: %s checksum verify failed on %llu " + "wanted %X found %X from_this_trans %d " + "level %d\n", root->fs_info->sb->s_id, - buf->start); + buf->start, val, found, from_this_trans, + btrfs_header_level(buf)); return 1; } } else { @@ -157,16 +208,73 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, return 0; } +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk("parent transid verify failed on %llu wanted %llu found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; +out: + clear_extent_buffer_uptodate(io_tree, eb); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; + +} + +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + 
int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; int found_level; unsigned long len; struct extent_buffer *eb; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; if (page->private == EXTENT_PAGE_PRIVATE) goto out; @@ -177,14 +285,34 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); } eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); - read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + btrfs_clear_buffer_defrag(eb); found_start = btrfs_header_bytenr(eb); if (found_start != start) { printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n", start, found_start, len); + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + printk("bad first page %lu %lu\n", eb->first_page->index, + page->index); + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + printk("csum not up to date page %lu\n", page->index); + WARN_ON(1); + goto err; } found_level = btrfs_header_level(eb); + spin_lock(&root->fs_info->hash_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_unlock(&root->fs_info->hash_lock); csum_tree_block(root, eb, 0); +err: free_extent_buffer(eb); out: return 0; @@ 
-198,36 +326,245 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end) return 0; } +int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + if (len == 0) { + WARN_ON(1); + } + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + btrfs_clear_buffer_defrag(eb); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk("bad first page %lu %lu\n", eb->first_page->index, + page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (memcmp_extent_buffer(eb, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE)) { + printk("bad fsid on block %Lu\n", eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; + release_extent_buffer_tail_pages(eb); +err: + free_extent_buffer(eb); +out: + return ret; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_workqueue_bio(struct bio *bio, int err) +#else +static int end_workqueue_bio(struct bio *bio, + unsigned int bytes_done, int err) +#endif +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + if (bio->bi_size) + return 1; +#endif + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + btrfs_queue_worker(&fs_info->endio_workers, 
&end_io_wq->work); + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + return 0; +#endif +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +static void run_one_async_submit(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + atomic_dec(&fs_info->nr_async_submits); + async->submit_bio_hook(async->inode, async->rw, async->bio, + async->mirror_num); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + extent_submit_bio_hook_t *submit_bio_hook) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_hook = submit_bio_hook; + async->work.func = run_one_async_submit; + async->work.flags = 0; + atomic_inc(&fs_info->nr_async_submits); + btrfs_queue_worker(&fs_info->workers, &async->work); + return 0; +} + +static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 offset; + int ret; + + offset = bio->bi_sector << 9; + + /* + * when we're called for a write, we're already in the async + * submission context. 
Just jump ingo btrfs_map_bio + */ + if (rw & (1 << BIO_RW)) { + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); + BUG_ON(ret); + + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num) +{ + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + if (!(rw & (1 << BIO_RW))) { + return __btree_submit_bio_hook(inode, rw, bio, mirror_num); + } + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + __btree_submit_bio_hook); +} + static int btree_writepage(struct page *page, struct writeback_control *wbc) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_write_full_page(tree, page, btree_get_extent, wbc); } static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_map_tree *tree; - tree = &BTRFS_I(mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 96 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + if (current_is_pdflush()) { + thresh = 96 * 1024 * 1024; + } else { + thresh = 8 * 1024 * 1024; + } + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty < thresh) { + return 0; + } + } return extent_writepages(tree, mapping, btree_get_extent, wbc); } int btree_readpage(struct file *file, struct page *page) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + 
struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_read_full_page(tree, page, btree_get_extent); } -static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags) +static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_map_tree *tree; + struct extent_io_tree *tree; + struct extent_map_tree *map; int ret; - tree = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(tree, page); + if (page_count(page) > 3) { + /* once for page->private, once for the caller, once + * once for the page cache + */ + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_state(map, tree, page, gfp_flags); if (ret == 1) { + invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE); ClearPagePrivate(page); set_page_private(page, 0); page_cache_release(page); @@ -237,10 +574,16 @@ static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags) static void btree_invalidatepage(struct page *page, unsigned long offset) { - struct extent_map_tree *tree; - tree = &BTRFS_I(page->mapping->host)->extent_tree; + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } } #if 0 @@ -273,7 +616,8 @@ static struct address_space_operations btree_aops = { .sync_page = block_sync_page, }; -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; @@ -282,48 +626,67 @@ int readahead_tree_block(struct btrfs_root *root, u64 
bytenr, u32 blocksize) buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return 0; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, - buf, 0, 0); + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); free_extent_buffer(buf); return ret; } +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, GFP_NOFS); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL, GFP_NOFS); + return eb; +} + + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize) + u32 blocksize, u64 parent_transid) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_map_tree *extent_tree; + struct extent_io_tree *io_tree; int ret; - extent_tree = &BTRFS_I(btree_inode)->extent_tree; + io_tree = &BTRFS_I(btree_inode)->io_tree; buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return NULL; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, - buf, 0, 1); - if (buf->flags & EXTENT_CSUM) { - return buf; - } - if (test_range_bit(extent_tree, buf->start, buf->start + buf->len - 1, - EXTENT_CSUM, 1)) { - buf->flags |= EXTENT_CSUM; - return buf; - } - ret = csum_tree_block(root, buf, 1); - set_extent_bits(extent_tree, buf->start, - buf->start + buf->len - 1, - EXTENT_CSUM, GFP_NOFS); - buf->flags |= EXTENT_CSUM; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) { + buf->flags |= EXTENT_UPTODATE; + } return buf; + } int 
clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { struct inode *btree_inode = root->fs_info->btree_inode; - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf); + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } return 0; } @@ -331,13 +694,13 @@ int wait_on_tree_block_writeback(struct btrfs_root *root, struct extent_buffer *buf) { struct inode *btree_inode = root->fs_info->btree_inode; - wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->extent_tree, + wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree, buf); return 0; } static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - struct btrfs_root *root, + u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -347,19 +710,26 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->sectorsize = sectorsize; root->nodesize = nodesize; root->leafsize = leafsize; + root->stripesize = stripesize; root->ref_cows = 0; + root->track_dirty = 0; + root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_inode = 0; root->last_inode_alloc = 0; root->name = NULL; + root->in_sysfs = 0; + + INIT_LIST_HEAD(&root->dirty_list); + spin_lock_init(&root->node_lock); + mutex_init(&root->objectid_mutex); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); memset(&root->root_kobj, 0, sizeof(root->root_kobj)); init_completion(&root->kobj_unregister); - init_rwsem(&root->snap_sem); root->defrag_running = 0; root->defrag_level = 0; root->root_key.objectid = objectid; @@ -375,14 +745,15 @@ static int find_and_setup_root(struct btrfs_root *tree_root, u32 blocksize; __setup_root(tree_root->nodesize, 
tree_root->leafsize, - tree_root->sectorsize, root, fs_info, objectid); + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); ret = btrfs_find_last_root(tree_root, objectid, &root->root_item, &root->root_key); BUG_ON(ret); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize); + blocksize, 0); BUG_ON(!root->node); return 0; } @@ -412,8 +783,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, } __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, root, fs_info, - location->objectid); + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); path = btrfs_alloc_path(); BUG_ON(!path); @@ -438,7 +809,7 @@ out: } blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize); + blocksize, 0); BUG_ON(!root->node); insert: root->ref_cows = 1; @@ -450,13 +821,36 @@ insert: return root; } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location, - const char *name, int namelen) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) { struct btrfs_root *root; int ret; + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return 
fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)location->objectid); if (root) @@ -473,6 +867,26 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid, root); + BUG_ON(ret); + + return root; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) +{ + struct btrfs_root *root; + int ret; + + root = btrfs_read_fs_root_no_name(fs_info, location); + if (!root) + return NULL; + + if (root->in_sysfs) + return root; ret = btrfs_set_root_name(root, name, namelen); if (ret) { @@ -488,11 +902,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } - - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid, root); - BUG_ON(ret); - + root->in_sysfs = 1; return root; } #if 0 @@ -513,20 +923,206 @@ static int add_hasher(struct btrfs_fs_info *info, char *type) { return 0; } #endif -struct btrfs_root *open_ctree(struct super_block *sb) + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + int limit = 256 * info->fs_devices->open_devices; + struct list_head *cur; + struct btrfs_device *device; + struct backing_dev_info *bdi; + + if ((bdi_bits & (1 << BDI_write_congested)) && + atomic_read(&info->nr_async_submits) > limit) { + return 1; + } + + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + return ret; +} + +/* + * this unplugs every device on the box, and it is only 
used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct list_head *cur; + struct btrfs_device *device; + struct btrfs_fs_info *info; + + info = (struct btrfs_fs_info *)bdi->unplug_io_data; + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) { + bdi->unplug_io_fn(bdi, page); + } + } +} + +void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (!page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) + return; + + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) + bdi_init(bdi); +#endif + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->state = 0; + bdi->capabilities = default_backing_dev_info.capabilities; + bdi->unplug_io_fn = btrfs_unplug_io_fn; + bdi->unplug_io_data = info; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = 
NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + if (ret == 1) + return ret; + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bios are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. 
For + * blocksize <= pagesize, it is basically a noop + */ + if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + bio_endio(bio, bio->bi_size, error); +#else + bio_endio(bio, error); +#endif +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; u32 leafsize; u32 blocksize; + u32 stripesize; + struct buffer_head *bh; struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info), + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), + GFP_NOFS); + struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); + struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root), + GFP_NOFS); int ret; - int err = -EIO; + int err = -EINVAL; + struct btrfs_super_block *disk_super; if (!extent_root || !tree_root || !fs_info) { @@ -538,40 +1134,66 @@ struct btrfs_root *open_ctree(struct super_block *sb) INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); spin_lock_init(&fs_info->hash_lock); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); - memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj)); init_completion(&fs_info->kobj_unregister); - sb_set_blocksize(sb, 4096); - fs_info->running_transaction = NULL; - fs_info->last_trans_committed = 0; fs_info->tree_root = tree_root; fs_info->extent_root = extent_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + 
INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->throttles, 0); fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; + setup_bdi(fs_info, &fs_info->bdi); fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; - fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size; + fs_info->thread_pool_size = min(num_online_cpus() + 2, 8); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping, GFP_NOFS); - BTRFS_I(fs_info->btree_inode)->extent_tree.ops = &btree_extent_map_ops; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - extent_map_tree_init(&fs_info->free_space_cache, + extent_io_tree_init(&fs_info->free_space_cache, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->block_group_cache, + extent_io_tree_init(&fs_info->block_group_cache, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->pinned_extents, + extent_io_tree_init(&fs_info->pinned_extents, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->pending_del, + extent_io_tree_init(&fs_info->pending_del, fs_info->btree_inode->i_mapping, GFP_NOFS); - extent_map_tree_init(&fs_info->extent_ins, + extent_io_tree_init(&fs_info->extent_ins, 
fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; - fs_info->closing = 0; - fs_info->total_pinned = 0; +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info); +#else INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner); +#endif BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); @@ -579,7 +1201,9 @@ struct btrfs_root *open_ctree(struct super_block *sb) mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); mutex_init(&fs_info->trans_mutex); - mutex_init(&fs_info->fs_mutex); + mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->alloc_mutex); + mutex_init(&fs_info->chunk_mutex); #if 0 ret = add_hasher(fs_info, "crc32c"); @@ -589,36 +1213,66 @@ struct btrfs_root *open_ctree(struct super_block *sb) goto fail_iput; } #endif - __setup_root(512, 512, 512, tree_root, + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - fs_info->sb_buffer = read_tree_block(tree_root, - BTRFS_SUPER_INFO_OFFSET, - 512); - if (!fs_info->sb_buffer) + bh = __bread(fs_devices->latest_bdev, + BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) goto fail_iput; - read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0, - sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); - read_extent_buffer(fs_info->sb_buffer, fs_info->fsid, - (unsigned long)btrfs_super_fsid(fs_info->sb_buffer), - BTRFS_FSID_SIZE); disk_super = &fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_sb_buffer; + err = btrfs_parse_options(tree_root, options); + if (err) + goto fail_sb_buffer; + + /* + * we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time, and so it + * cannot dynamically grow. 
+ */ + btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + + + err = -EINVAL; + if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) { + printk("Btrfs: wanted %llu devices, but found %llu\n", + (unsigned long long)btrfs_super_num_devices(disk_super), + (unsigned long long)fs_devices->open_devices); + if (btrfs_test_opt(tree_root, DEGRADED)) + printk("continuing in degraded mode\n"); + else { + goto fail_sb_buffer; + } + } + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + nodesize = btrfs_super_nodesize(disk_super); leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); tree_root->nodesize = nodesize; tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; - sb_set_blocksize(sb, sectorsize); + tree_root->stripesize = stripesize; - i_size_write(fs_info->btree_inode, - btrfs_super_total_bytes(disk_super)); + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, sizeof(disk_super->magic))) { @@ -626,53 +1280,240 @@ struct btrfs_root *open_ctree(struct super_block *sb) goto fail_sb_buffer; } + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk("btrfs: failed to read the system array on %s\n", + sb->s_id); + goto fail_sys_array; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + 
chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, 0); + BUG_ON(!chunk_root->node); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + BUG_ON(ret); + + btrfs_close_extra_devices(fs_devices); + blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); + tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), - blocksize); + blocksize, 0); if (!tree_root->node) goto fail_sb_buffer; - mutex_lock(&fs_info->fs_mutex); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) { - mutex_unlock(&fs_info->fs_mutex); + if (ret) goto fail_tree_root; - } + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + dev_root->track_dirty = 1; + + if (ret) + goto fail_extent_root; btrfs_read_block_groups(extent_root); fs_info->generation = btrfs_super_generation(disk_super) + 1; - mutex_unlock(&fs_info->fs_mutex); + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + return tree_root; +fail_extent_root: + free_extent_buffer(extent_root->node); fail_tree_root: free_extent_buffer(tree_root->node); +fail_sys_array: fail_sb_buffer: - free_extent_buffer(fs_info->sb_buffer); + extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->submit_workers); fail_iput: iput(fs_info->btree_inode); fail: + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + kfree(extent_root); kfree(tree_root); +#if LINUX_VERSION_CODE > 
KERNEL_VERSION(2,6,23) + bdi_destroy(&fs_info->bdi); +#endif kfree(fs_info); return ERR_PTR(err); } +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +int write_all_supers(struct btrfs_root *root) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + struct buffer_head *bh; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u32 crc; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata) + continue; + + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + + crc = ~(u32)0; + crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc, + 
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096, + BTRFS_SUPER_INFO_SIZE); + + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + dev->pending_io = bh; + + get_bh(bh); + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + + if (do_barriers && dev->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + dev->name); + set_buffer_uptodate(bh); + dev->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk("btrfs: %d errors while writing supers\n", total_errors); + BUG(); + } + total_errors = 0; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata) + continue; + + BUG_ON(!dev->pending_io); + bh = dev->pending_io; + wait_on_buffer(bh); + if (!buffer_uptodate(dev->pending_io)) { + if (do_barriers && dev->barriers) { + printk("btrfs: disabling barriers on dev %s\n", + dev->name); + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + dev->barriers = 0; + ret = submit_bh(WRITE, bh); + BUG_ON(ret); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + total_errors++; + } else { + total_errors++; + } + + } + dev->pending_io = NULL; + brelse(bh); + } + if (total_errors > max_errors) { + printk("btrfs: %d errors while writing supers\n", total_errors); + BUG(); + } + return 0; +} + int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - struct extent_buffer *super = root->fs_info->sb_buffer; - struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super); - ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping, - super->start, 
super->len); + ret = write_all_supers(root); return ret; } @@ -680,7 +1521,8 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); - btrfs_sysfs_del_root(root); + if (root->in_sysfs) + btrfs_sysfs_del_root(root); if (root->inode) iput(root->inode); if (root->node) @@ -718,8 +1560,9 @@ int close_ctree(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; fs_info->closing = 1; + smp_mb(); + btrfs_transaction_flush_work(root); - mutex_lock(&fs_info->fs_mutex); btrfs_defrag_dirty_roots(root->fs_info); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); @@ -728,21 +1571,45 @@ int close_ctree(struct btrfs_root *root) btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); + write_ctree_super(NULL, root); - mutex_unlock(&fs_info->fs_mutex); + btrfs_transaction_flush_work(root); + + if (fs_info->delalloc_bytes) { + printk("btrfs: at unmount delalloc count %Lu\n", + fs_info->delalloc_bytes); + } if (fs_info->extent_root->node) free_extent_buffer(fs_info->extent_root->node); if (fs_info->tree_root->node) free_extent_buffer(fs_info->tree_root->node); - free_extent_buffer(fs_info->sb_buffer); + if (root->fs_info->chunk_root->node) + free_extent_buffer(root->fs_info->chunk_root->node); + + if (root->fs_info->dev_root->node) + free_extent_buffer(root->fs_info->dev_root->node); btrfs_free_block_groups(root->fs_info); del_fs_roots(fs_info); - extent_map_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + + extent_io_tree_empty_lru(&fs_info->free_space_cache); + extent_io_tree_empty_lru(&fs_info->block_group_cache); + extent_io_tree_empty_lru(&fs_info->pinned_extents); + extent_io_tree_empty_lru(&fs_info->pending_del); + extent_io_tree_empty_lru(&fs_info->extent_ins); +
extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); + truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); + + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->submit_workers); + iput(fs_info->btree_inode); #if 0 while(!list_empty(&fs_info->hashers)) { @@ -754,21 +1621,38 @@ int close_ctree(struct btrfs_root *root) kfree(hasher); } #endif + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) + bdi_destroy(&fs_info->bdi); +#endif + kfree(fs_info->extent_root); kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); return 0; } -int btrfs_buffer_uptodate(struct extent_buffer *buf) +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) { + int ret; struct inode *btree_inode = buf->first_page->mapping->host; - return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf); + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; } int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { struct inode *btree_inode = buf->first_page->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -778,26 +1662,61 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; + WARN_ON(!btrfs_tree_locked(buf)); if (transid != root->fs_info->generation) { printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n", (unsigned long long)buf->start, transid, root->fs_info->generation); WARN_ON(1); } - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf); +
set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); +} + +void btrfs_throttle(struct btrfs_root *root) +{ + struct backing_dev_info *bdi; + + bdi = &root->fs_info->bdi; + if (atomic_read(&root->fs_info->throttles) && + bdi_write_congested(bdi)) { +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18) + congestion_wait(WRITE, HZ/20); +#else + blk_congestion_wait(WRITE, HZ/20); +#endif + + } } void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + struct extent_io_tree *tree; + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 16 * 1024 * 1024; + tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + + if (current_is_pdflush()) + return; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; } void btrfs_set_buffer_defrag(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start, + set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS); } @@ -805,7 +1724,7 @@ void btrfs_set_buffer_defrag_done(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start, + set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, GFP_NOFS); } @@ -814,7 +1733,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf) { struct btrfs_root *root = 
BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return test_range_bit(&BTRFS_I(btree_inode)->extent_tree, + return test_range_bit(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0); } @@ -822,7 +1741,7 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return test_range_bit(&BTRFS_I(btree_inode)->extent_tree, + return test_range_bit(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, 0); } @@ -831,7 +1750,7 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree, + return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, GFP_NOFS); } @@ -840,19 +1759,26 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct inode *btree_inode = root->fs_info->btree_inode; - return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree, + return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS); } -int btrfs_read_buffer(struct extent_buffer *buf) +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree, - buf, 0, 1); + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) { + buf->flags |= EXTENT_UPTODATE; + } + return ret; } -static struct extent_map_ops 
btree_extent_map_ops = { +static struct extent_io_ops btree_extent_io_ops = { .writepage_io_hook = btree_writepage_io_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, };