Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 14 Dec 2010 19:08:13 +0000 (11:08 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 14 Dec 2010 19:08:13 +0000 (11:08 -0800)
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: prevent RAID level downgrades when space is low
  Btrfs: account for missing devices in RAID allocation profiles
  Btrfs: EIO when we fail to read tree roots
  Btrfs: fix compiler warnings
  Btrfs: Make async snapshot ioctl more generic
  Btrfs: pwrite blocked when writing from the mmaped buffer of the same page
  Btrfs: Fix a crash when mounting a subvolume
  Btrfs: fix sync subvol/snapshot creation
  Btrfs: Fix page leak in compressed writeback path
  Btrfs: do not BUG if we fail to remove the orphan item for dead snapshots
  Btrfs: fixup return code for btrfs_del_orphan_item
  Btrfs: do not do fast caching if we are allocating blocks for tree_root
  Btrfs: deal with space cache errors better
  Btrfs: fix use after free in O_DIRECT

1  2 
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/super.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --combined fs/btrfs/disk-io.c
@@@ -696,6 -696,7 +696,7 @@@ static int btree_submit_bio_hook(struc
                                   __btree_submit_bio_done);
  }
  
+ #ifdef CONFIG_MIGRATION
  static int btree_migratepage(struct address_space *mapping,
                        struct page *newpage, struct page *page)
  {
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;
- #ifdef CONFIG_MIGRATION
        return migrate_page(mapping, newpage, page);
- #else
-       return -ENOSYS;
- #endif
  }
+ #endif
  
  static int btree_writepage(struct page *page, struct writeback_control *wbc)
  {
@@@ -1009,7 -1007,10 +1007,10 @@@ static int find_and_setup_root(struct b
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-       BUG_ON(!root->node);
+       if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+               free_extent_buffer(root->node);
+               return -EIO;
+       }
        root->commit_root = btrfs_root_node(root);
        return 0;
  }
@@@ -2093,7 -2094,7 +2094,7 @@@ static void btrfs_end_buffer_write_sync
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
 -              if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 +              if (printk_ratelimit()) {
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
@@@ -2230,10 -2231,21 +2231,10 @@@ static int write_dev_supers(struct btrf
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
  
 -              if (i == last_barrier && do_barriers && device->barriers) {
 -                      ret = submit_bh(WRITE_BARRIER, bh);
 -                      if (ret == -EOPNOTSUPP) {
 -                              printk("btrfs: disabling barriers on dev %s\n",
 -                                     device->name);
 -                              set_buffer_uptodate(bh);
 -                              device->barriers = 0;
 -                              /* one reference for submit_bh */
 -                              get_bh(bh);
 -                              lock_buffer(bh);
 -                              ret = submit_bh(WRITE_SYNC, bh);
 -                      }
 -              } else {
 +              if (i == last_barrier && do_barriers)
 +                      ret = submit_bh(WRITE_FLUSH_FUA, bh);
 +              else
                        ret = submit_bh(WRITE_SYNC, bh);
 -              }
  
                if (ret)
                        errors++;
diff --combined fs/btrfs/extent-tree.c
@@@ -429,6 -429,7 +429,7 @@@ err
  
  static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
                             int load_cache_only)
  {
        struct btrfs_fs_info *fs_info = cache->fs_info;
  
        /*
         * We can't do the read from on-disk cache during a commit since we need
-        * to have the normal tree locking.
+        * to have the normal tree locking.  Also if we are currently trying to
+        * allocate blocks for the tree root we can't do the fast caching since
+        * we likely hold important locks.
         */
-       if (!trans->transaction->in_commit) {
+       if (!trans->transaction->in_commit &&
+           (root && root != root->fs_info->tree_root)) {
                spin_lock(&cache->lock);
                if (cache->cached != BTRFS_CACHE_NO) {
                        spin_unlock(&cache->lock);
@@@ -1742,7 -1746,8 +1746,7 @@@ static int remove_extent_backref(struc
  static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
  {
 -      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
 -                      BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 +      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
  }
  
  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@@ -2741,6 -2746,7 +2745,7 @@@ static int cache_save_setup(struct btrf
        struct btrfs_root *root = block_group->fs_info->tree_root;
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
+       int dcs = BTRFS_DC_ERROR;
        int num_pages = 0;
        int retries = 0;
        int ret = 0;
@@@ -2795,6 -2801,8 +2800,8 @@@ again
  
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED) {
+               /* We're not cached, don't bother trying to write stuff out */
+               dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       if (!ret)
+               dcs = BTRFS_DC_SETUP;
        btrfs_free_reserved_data_space(inode, num_pages);
  out_put:
        iput(inode);
@@@ -2828,10 -2838,7 +2837,7 @@@ out_free
        btrfs_release_path(root, path);
  out:
        spin_lock(&block_group->lock);
-       if (ret)
-               block_group->disk_cache_state = BTRFS_DC_ERROR;
-       else
-               block_group->disk_cache_state = BTRFS_DC_SETUP;
+       block_group->disk_cache_state = dcs;
        spin_unlock(&block_group->lock);
  
        return ret;
@@@ -3037,7 -3044,13 +3043,13 @@@ static void set_avail_alloc_bits(struc
  
  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
  {
-       u64 num_devices = root->fs_info->fs_devices->rw_devices;
+       /*
+        * we add in the count of missing devices because we want
+        * to make sure that any RAID levels on a degraded FS
+        * continue to be honored.
+        */
+       u64 num_devices = root->fs_info->fs_devices->rw_devices +
+               root->fs_info->fs_devices->missing_devices;
  
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@@ -4080,7 -4093,7 +4092,7 @@@ static int update_block_group(struct bt
                 * space back to the block group, otherwise we will leak space.
                 */
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
-                       cache_block_group(cache, trans, 1);
+                       cache_block_group(cache, trans, NULL, 1);
  
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
@@@ -4930,11 -4943,31 +4942,31 @@@ search
                btrfs_get_block_group(block_group);
                search_start = block_group->key.objectid;
  
+               /*
+                * this can happen if we end up cycling through all the
+                * raid types, but we want to make sure we only allocate
+                * for the proper type.
+                */
+               if (!block_group_bits(block_group, data)) {
+                   u64 extra = BTRFS_BLOCK_GROUP_DUP |
+                               BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID10;
+                       /*
+                        * if they asked for extra copies and this block group
+                        * doesn't provide them, bail.  This does allow us to
+                        * fill raid0 from raid1.
+                        */
+                       if ((data & extra) && !(block_group->flags & extra))
+                               goto loop;
+               }
  have_block_group:
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
                        u64 free_percent;
  
-                       ret = cache_block_group(block_group, trans, 1);
+                       ret = cache_block_group(block_group, trans,
+                                               orig_root, 1);
                        if (block_group->cached == BTRFS_CACHE_FINISHED)
                                goto have_block_group;
  
                        if (loop > LOOP_CACHING_NOWAIT ||
                            (loop > LOOP_FIND_IDEAL &&
                             atomic_read(&space_info->caching_threads) < 2)) {
-                               ret = cache_block_group(block_group, trans, 0);
+                               ret = cache_block_group(block_group, trans,
+                                                       orig_root, 0);
                                BUG_ON(ret);
                        }
                        found_uncached_bg = true;
@@@ -5515,7 -5549,7 +5548,7 @@@ int btrfs_alloc_logged_file_extent(stru
        u64 num_bytes = ins->offset;
  
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       cache_block_group(block_group, trans, 0);
+       cache_block_group(block_group, trans, NULL, 0);
        caching_ctl = get_caching_control(block_group);
  
        if (!caching_ctl) {
@@@ -6300,9 -6334,13 +6333,13 @@@ int btrfs_drop_snapshot(struct btrfs_ro
                                           NULL, NULL);
                BUG_ON(ret < 0);
                if (ret > 0) {
-                       ret = btrfs_del_orphan_item(trans, tree_root,
-                                                   root->root_key.objectid);
-                       BUG_ON(ret);
+                       /* if we fail to delete the orphan item this time
+                        * around, it'll get picked up the next time.
+                        *
+                        * The most common failure here is just -ENOENT.
+                        */
+                       btrfs_del_orphan_item(trans, tree_root,
+                                             root->root_key.objectid);
                }
        }
  
@@@ -7878,7 -7916,14 +7915,14 @@@ static u64 update_block_group_flags(str
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
  
-       num_devices = root->fs_info->fs_devices->rw_devices;
+       /*
+        * we add in the count of missing devices because we want
+        * to make sure that any RAID levels on a degraded FS
+        * continue to be honored.
+        */
+       num_devices = root->fs_info->fs_devices->rw_devices +
+               root->fs_info->fs_devices->missing_devices;
        if (num_devices == 1) {
                stripped |= BTRFS_BLOCK_GROUP_DUP;
                stripped = flags & ~stripped;
@@@ -8247,7 -8292,6 +8291,6 @@@ int btrfs_read_block_groups(struct btrf
                        break;
                if (ret != 0)
                        goto error;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                cache = kzalloc(sizeof(*cache), GFP_NOFS);
diff --combined fs/btrfs/inode.c
@@@ -495,7 -495,7 +495,7 @@@ again
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret);
  
-               if (start + num_bytes < end && start + num_bytes < actual_end) {
+               if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
@@@ -3877,7 -3877,7 +3877,7 @@@ again
        p = &root->inode_tree.rb_node;
        parent = NULL;
  
 -      if (hlist_unhashed(&inode->i_hash))
 +      if (inode_unhashed(inode))
                return;
  
        spin_lock(&root->inode_lock);
@@@ -4802,7 -4802,7 +4802,7 @@@ static int btrfs_link(struct dentry *ol
        }
  
        btrfs_set_trans_block_group(trans, dir);
 -      atomic_inc(&inode->i_count);
 +      ihold(inode);
  
        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
  
@@@ -5712,9 -5712,9 +5712,9 @@@ static void btrfs_end_dio_bio(struct bi
  
        if (err) {
                printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
-                     "disk_bytenr %lu len %u err no %d\n",
-                     dip->inode->i_ino, bio->bi_rw, bio->bi_sector,
-                     bio->bi_size, err);
+                     "sector %#Lx len %u err no %d\n",
+                     dip->inode->i_ino, bio->bi_rw,
+                     (unsigned long long)bio->bi_sector, bio->bi_size, err);
                dip->errors = 1;
  
                /*
@@@ -5934,8 -5934,7 +5934,7 @@@ free_ordered
         */
        if (write) {
                struct btrfs_ordered_extent *ordered;
-               ordered = btrfs_lookup_ordered_extent(inode,
-                                                     dip->logical_offset);
+               ordered = btrfs_lookup_ordered_extent(inode, file_offset);
                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
                        btrfs_free_reserved_extent(root, ordered->start,
diff --combined fs/btrfs/super.c
@@@ -589,8 -589,8 +589,8 @@@ static int btrfs_set_super(struct super
   * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
   *      for multiple device setup.  Make sure to keep it in sync.
   */
 -static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 -              const char *dev_name, void *data, struct vfsmount *mnt)
 +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 +              const char *dev_name, void *data)
  {
        struct block_device *bdev = NULL;
        struct super_block *s;
                                          &subvol_name, &subvol_objectid,
                                          &fs_devices);
        if (error)
 -              return error;
 +              return ERR_PTR(error);
  
        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
        if (error)
                mutex_unlock(&root->d_inode->i_mutex);
  
                if (IS_ERR(new_root)) {
+                       dput(root);
                        deactivate_locked_super(s);
                        error = PTR_ERR(new_root);
-                       dput(root);
                        goto error_free_subvol_name;
                }
                if (!new_root->d_inode) {
                root = new_root;
        }
  
 -      mnt->mnt_sb = s;
 -      mnt->mnt_root = root;
 -
        kfree(subvol_name);
 -      return 0;
 +      return root;
  
  error_s:
        error = PTR_ERR(s);
@@@ -712,7 -715,7 +712,7 @@@ error_close_devices
        kfree(tree_root);
  error_free_subvol_name:
        kfree(subvol_name);
 -      return error;
 +      return ERR_PTR(error);
  }
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@@ -796,7 -799,7 +796,7 @@@ static int btrfs_statfs(struct dentry *
  static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
 -      .get_sb         = btrfs_get_sb,
 +      .mount          = btrfs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_REQUIRES_DEV,
  };
@@@ -865,7 -868,6 +865,7 @@@ static const struct file_operations btr
        .unlocked_ioctl  = btrfs_control_ioctl,
        .compat_ioctl = btrfs_control_ioctl,
        .owner   = THIS_MODULE,
 +      .llseek = noop_llseek,
  };
  
  static struct miscdevice btrfs_misc = {
diff --combined fs/btrfs/volumes.c
@@@ -398,6 -398,7 +398,6 @@@ static noinline int device_list_add(con
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
 -              device->barriers = 1;
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
  
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
-       } else if (strcmp(device->name, path)) {
+       } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
+               if (device->missing) {
+                       fs_devices->missing_devices--;
+                       device->missing = 0;
+               }
        }
  
        if (found_transid > fs_devices->latest_trans) {
@@@ -461,6 -466,7 +465,6 @@@ static struct btrfs_fs_devices *clone_f
                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
 -              device->barriers = 1;
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);
@@@ -1236,6 -1242,9 +1240,9 @@@ int btrfs_rm_device(struct btrfs_root *
  
        device->fs_devices->num_devices--;
  
+       if (device->missing)
+               root->fs_info->fs_devices->missing_devices--;
        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
@@@ -1487,6 -1496,7 +1494,6 @@@ int btrfs_init_new_device(struct btrfs_
        trans = btrfs_start_transaction(root, 0);
        lock_chunks(root);
  
 -      device->barriers = 1;
        device->writeable = 1;
        device->work.func = pending_bios_fn;
        generate_random_uuid(device->uuid);
@@@ -3076,11 -3086,14 +3083,13 @@@ static struct btrfs_device *add_missing
                return NULL;
        list_add(&device->dev_list,
                 &fs_devices->devices);
 -      device->barriers = 1;
        device->dev_root = root->fs_info->dev_root;
        device->devid = devid;
        device->work.func = pending_bios_fn;
        device->fs_devices = fs_devices;
+       device->missing = 1;
        fs_devices->num_devices++;
+       fs_devices->missing_devices++;
        spin_lock_init(&device->io_lock);
        INIT_LIST_HEAD(&device->dev_alloc_list);
        memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@@ -3278,6 -3291,15 +3287,15 @@@ static int read_one_dev(struct btrfs_ro
                        device = add_missing_dev(root, devid, dev_uuid);
                        if (!device)
                                return -ENOMEM;
+               } else if (!device->missing) {
+                       /*
+                        * this happens when a device that was properly setup
+                        * in the device info lists suddenly goes bad.
+                        * device->bdev is NULL, and so we have to set
+                        * device->missing to one here
+                        */
+                       root->fs_info->fs_devices->missing_devices++;
+                       device->missing = 1;
                }
        }
  
diff --combined fs/btrfs/volumes.h
@@@ -42,8 -42,10 +42,9 @@@ struct btrfs_device 
        int running_pending;
        u64 generation;
  
 -      int barriers;
        int writeable;
        int in_fs_metadata;
+       int missing;
  
        spinlock_t io_lock;
  
@@@ -93,6 -95,7 +94,7 @@@ struct btrfs_fs_devices 
        u64 num_devices;
        u64 open_devices;
        u64 rw_devices;
+       u64 missing_devices;
        u64 total_rw_bytes;
        struct block_device *latest_bdev;