Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Jan 2011 22:43:43 +0000 (14:43 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Jan 2011 22:43:43 +0000 (14:43 -0800)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (25 commits)
  Btrfs: forced readonly mounts on errors
  btrfs: Require CAP_SYS_ADMIN for filesystem rebalance
  Btrfs: don't warn if we get ENOSPC in btrfs_block_rsv_check
  btrfs: Fix memory leak in btrfs_read_fs_root_no_radix()
  btrfs: check NULL or not
  btrfs: Don't pass NULL ptr to func that may deref it.
  btrfs: mount failure return value fix
  btrfs: Mem leak in btrfs_get_acl()
  btrfs: fix wrong free space information of btrfs
  btrfs: make the chunk allocator utilize the devices better
  btrfs: restructure find_free_dev_extent()
  btrfs: fix wrong calculation of stripe size
  btrfs: try to reclaim some space when chunk allocation fails
  btrfs: fix wrong data space statistics
  fs/btrfs: Fix build of ctree
  Btrfs: fix off by one while setting block groups readonly
  Btrfs: Add BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS ioctls
  Btrfs: Add readonly snapshots support
  Btrfs: Refactor btrfs_ioctl_snap_create()
  btrfs: Extract duplicate decompress code
  ...

1  2 
fs/btrfs/acl.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/super.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --combined fs/btrfs/acl.c
@@@ -60,8 -60,10 +60,10 @@@ static struct posix_acl *btrfs_get_acl(
                size = __btrfs_getxattr(inode, name, value, size);
                if (size > 0) {
                        acl = posix_acl_from_xattr(value, size);
-                       if (IS_ERR(acl))
+                       if (IS_ERR(acl)) {
+                               kfree(value);
                                return acl;
+                       }
                        set_cached_acl(inode, type, acl);
                }
                kfree(value);
@@@ -185,23 -187,18 +187,23 @@@ static int btrfs_xattr_acl_set(struct d
        return ret;
  }
  
 -int btrfs_check_acl(struct inode *inode, int mask)
 +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
  {
 -      struct posix_acl *acl;
        int error = -EAGAIN;
  
 -      acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
 +      if (flags & IPERM_FLAG_RCU) {
 +              if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
 +                      error = -ECHILD;
  
 -      if (IS_ERR(acl))
 -              return PTR_ERR(acl);
 -      if (acl) {
 -              error = posix_acl_permission(inode, acl, mask);
 -              posix_acl_release(acl);
 +      } else {
 +              struct posix_acl *acl;
 +              acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
 +              if (IS_ERR(acl))
 +                      return PTR_ERR(acl);
 +              if (acl) {
 +                      error = posix_acl_permission(inode, acl, mask);
 +                      posix_acl_release(acl);
 +              }
        }
  
        return error;
diff --combined fs/btrfs/ctree.h
@@@ -295,6 -295,14 +295,14 @@@ static inline unsigned long btrfs_chunk
  #define BTRFS_FSID_SIZE 16
  #define BTRFS_HEADER_FLAG_WRITTEN     (1ULL << 0)
  #define BTRFS_HEADER_FLAG_RELOC               (1ULL << 1)
+ /*
+  * File system states
+  */
+ /* Errors detected */
+ #define BTRFS_SUPER_FLAG_ERROR                (1ULL << 2)
  #define BTRFS_SUPER_FLAG_SEEDING      (1ULL << 32)
  #define BTRFS_SUPER_FLAG_METADUMP     (1ULL << 33)
  
@@@ -399,13 -407,15 +407,15 @@@ struct btrfs_super_block 
  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF  (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS   (1ULL << 2)
+ #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO   (1ULL << 3)
  
  #define BTRFS_FEATURE_COMPAT_SUPP             0ULL
  #define BTRFS_FEATURE_COMPAT_RO_SUPP          0ULL
  #define BTRFS_FEATURE_INCOMPAT_SUPP                   \
        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |         \
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
-        BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+        BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |          \
+        BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
  
  /*
   * A leaf is full of items. offset and size tell us where to find
@@@ -552,9 -562,11 +562,11 @@@ struct btrfs_timespec 
  } __attribute__ ((__packed__));
  
  enum btrfs_compression_type {
-       BTRFS_COMPRESS_NONE = 0,
-       BTRFS_COMPRESS_ZLIB = 1,
-       BTRFS_COMPRESS_LAST = 2,
+       BTRFS_COMPRESS_NONE  = 0,
+       BTRFS_COMPRESS_ZLIB  = 1,
+       BTRFS_COMPRESS_LZO   = 2,
+       BTRFS_COMPRESS_TYPES = 2,
+       BTRFS_COMPRESS_LAST  = 3,
  };
  
  struct btrfs_inode_item {
@@@ -598,6 -610,8 +610,8 @@@ struct btrfs_dir_item 
        u8 type;
  } __attribute__ ((__packed__));
  
+ #define BTRFS_ROOT_SUBVOL_RDONLY      (1ULL << 0)
  struct btrfs_root_item {
        struct btrfs_inode_item inode;
        __le64 generation;
@@@ -896,7 -910,8 +910,8 @@@ struct btrfs_fs_info 
         */
        u64 last_trans_log_full_commit;
        u64 open_ioctl_trans;
-       unsigned long mount_opt;
+       unsigned long mount_opt:20;
+       unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
        struct btrfs_transaction *running_transaction;
        unsigned metadata_ratio;
  
        void *bdev_holder;
+       /* filesystem state */
+       u64 fs_state;
  };
  
  /*
@@@ -1894,6 -1912,11 +1912,11 @@@ BTRFS_SETGET_STACK_FUNCS(root_limit, st
  BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
                         last_snapshot, 64);
  
+ static inline bool btrfs_root_readonly(struct btrfs_root *root)
+ {
+       return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+ }
  /* struct btrfs_super_block */
  
  BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@@ -2146,6 -2169,7 +2169,7 @@@ int btrfs_make_block_group(struct btrfs
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start);
  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
  void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
  void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
  int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@@ -2189,6 -2213,12 +2213,12 @@@ int btrfs_set_block_group_ro(struct btr
  int btrfs_set_block_group_rw(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+                                  u64 start, u64 end);
+ int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+                              u64 num_bytes);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@@ -2542,10 -2572,18 +2572,18 @@@ ssize_t btrfs_listxattr(struct dentry *
  /* super.c */
  int btrfs_parse_options(struct btrfs_root *root, char *options);
  int btrfs_sync_fs(struct super_block *sb, int wait);
+ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+                    unsigned int line, int errno);
+ #define btrfs_std_error(fs_info, errno)                               \
+ do {                                                          \
+       if ((errno))                                            \
+               __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+ } while (0)
  
  /* acl.c */
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 -int btrfs_check_acl(struct inode *inode, int mask);
 +int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
  #else
  #define btrfs_check_acl NULL
  #endif
diff --combined fs/btrfs/disk-io.c
  static struct extent_io_ops btree_extent_io_ops;
  static void end_workqueue_fn(struct btrfs_work *work);
  static void free_fs_root(struct btrfs_root *root);
+ static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                                   int read_only);
+ static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+ static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                     struct btrfs_root *root);
+ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                       struct extent_io_tree *dirty_pages,
+                                       int mark);
+ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                      struct extent_io_tree *pinned_extents);
+ static int btrfs_cleanup_transaction(struct btrfs_root *root);
  
  /*
   * end_io_wq structs are used to do processing in task context when an IO is
@@@ -353,6 -367,10 +367,10 @@@ static int csum_dirty_buffer(struct btr
        WARN_ON(len == 0);
  
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+       if (eb == NULL) {
+               WARN_ON(1);
+               goto out;
+       }
        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                             btrfs_header_generation(eb));
        BUG_ON(ret);
@@@ -427,6 -445,10 +445,10 @@@ static int btree_readpage_end_io_hook(s
        WARN_ON(len == 0);
  
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+       if (eb == NULL) {
+               ret = -EIO;
+               goto out;
+       }
  
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
@@@ -1145,6 -1167,7 +1167,7 @@@ struct btrfs_root *btrfs_read_fs_root_n
        }
        btrfs_free_path(path);
        if (ret) {
+               kfree(root);
                if (ret > 0)
                        ret = -ENOENT;
                return ERR_PTR(ret);
@@@ -1713,8 -1736,10 +1736,10 @@@ struct btrfs_root *open_ctree(struct su
                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
  
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-       if (!bh)
+       if (!bh) {
+               err = -EINVAL;
                goto fail_iput;
+       }
  
        memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
        memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
        if (!btrfs_super_root(disk_super))
                goto fail_iput;
  
+       /* check FS state, whether FS is broken. */
+       fs_info->fs_state |= btrfs_super_flags(disk_super);
+       btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
        ret = btrfs_parse_options(tree_root, options);
        if (ret) {
                err = ret;
        }
  
        features = btrfs_super_incompat_flags(disk_super);
-       if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-               features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-               btrfs_set_super_incompat_flags(disk_super, features);
-       }
+       features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+       if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+               features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+       btrfs_set_super_incompat_flags(disk_super, features);
  
        features = btrfs_super_compat_ro_flags(disk_super) &
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
  
-       if (btrfs_super_log_root(disk_super) != 0) {
+       /* do not make disk changes in broken FS */
+       if (btrfs_super_log_root(disk_super) != 0 &&
+           !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                u64 bytenr = btrfs_super_log_root(disk_super);
  
                if (fs_devices->rw_devices == 0) {
@@@ -2094,7 -2126,7 +2126,7 @@@ static void btrfs_end_buffer_write_sync
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
 -              if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 +              if (printk_ratelimit()) {
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
@@@ -2231,10 -2263,21 +2263,10 @@@ static int write_dev_supers(struct btrf
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
  
 -              if (i == last_barrier && do_barriers && device->barriers) {
 -                      ret = submit_bh(WRITE_BARRIER, bh);
 -                      if (ret == -EOPNOTSUPP) {
 -                              printk("btrfs: disabling barriers on dev %s\n",
 -                                     device->name);
 -                              set_buffer_uptodate(bh);
 -                              device->barriers = 0;
 -                              /* one reference for submit_bh */
 -                              get_bh(bh);
 -                              lock_buffer(bh);
 -                              ret = submit_bh(WRITE_SYNC, bh);
 -                      }
 -              } else {
 +              if (i == last_barrier && do_barriers)
 +                      ret = submit_bh(WRITE_FLUSH_FUA, bh);
 +              else
                        ret = submit_bh(WRITE_SYNC, bh);
 -              }
  
                if (ret)
                        errors++;
@@@ -2442,8 -2485,28 +2474,28 @@@ int close_ctree(struct btrfs_root *root
        smp_mb();
  
        btrfs_put_block_group_cache(fs_info);
+       /*
+        * Here come 2 situations when btrfs is broken to flip readonly:
+        *
+        * 1. when btrfs flips readonly somewhere else before
+        * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+        * and btrfs will skip to write sb directly to keep
+        * ERROR state on disk.
+        *
+        * 2. when btrfs flips readonly just in btrfs_commit_super,
+        * and in such case, btrfs cannot write sb via btrfs_commit_super,
+        * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+        * btrfs will cleanup all FS resources first and write sb then.
+        */
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-               ret =  btrfs_commit_super(root);
+               ret = btrfs_commit_super(root);
+               if (ret)
+                       printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+       }
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               ret = btrfs_error_commit_super(root);
                if (ret)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
        return 0;
  }
  
+ static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                             int read_only)
+ {
+       if (read_only)
+               return;
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+               printk(KERN_WARNING "warning: mount fs with errors, "
+                      "running btrfsck is recommended\n");
+ }
+ int btrfs_error_commit_super(struct btrfs_root *root)
+ {
+       int ret;
+       mutex_lock(&root->fs_info->cleaner_mutex);
+       btrfs_run_delayed_iputs(root);
+       mutex_unlock(&root->fs_info->cleaner_mutex);
+       down_write(&root->fs_info->cleanup_work_sem);
+       up_write(&root->fs_info->cleanup_work_sem);
+       /* cleanup FS via transaction */
+       btrfs_cleanup_transaction(root);
+       ret = write_ctree_super(NULL, root, 0);
+       return ret;
+ }
+ static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+ {
+       struct btrfs_inode *btrfs_inode;
+       struct list_head splice;
+       INIT_LIST_HEAD(&splice);
+       mutex_lock(&root->fs_info->ordered_operations_mutex);
+       spin_lock(&root->fs_info->ordered_extent_lock);
+       list_splice_init(&root->fs_info->ordered_operations, &splice);
+       while (!list_empty(&splice)) {
+               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                        ordered_operations);
+               list_del_init(&btrfs_inode->ordered_operations);
+               btrfs_invalidate_inodes(btrfs_inode->root);
+       }
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+       mutex_unlock(&root->fs_info->ordered_operations_mutex);
+       return 0;
+ }
+ static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+ {
+       struct list_head splice;
+       struct btrfs_ordered_extent *ordered;
+       struct inode *inode;
+       INIT_LIST_HEAD(&splice);
+       spin_lock(&root->fs_info->ordered_extent_lock);
+       list_splice_init(&root->fs_info->ordered_extents, &splice);
+       while (!list_empty(&splice)) {
+               ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+                                    root_extent_list);
+               list_del_init(&ordered->root_extent_list);
+               atomic_inc(&ordered->refs);
+               /* the inode may be getting freed (in sys_unlink path). */
+               inode = igrab(ordered->inode);
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+               if (inode)
+                       iput(inode);
+               atomic_set(&ordered->refs, 1);
+               btrfs_put_ordered_extent(ordered);
+               spin_lock(&root->fs_info->ordered_extent_lock);
+       }
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+       return 0;
+ }
+ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                     struct btrfs_root *root)
+ {
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       int ret = 0;
+       delayed_refs = &trans->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       if (delayed_refs->num_entries == 0) {
+               printk(KERN_INFO "delayed_refs has NO entry\n");
+               return ret;
+       }
+       node = rb_first(&delayed_refs->root);
+       while (node) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               node = rb_next(node);
+               ref->in_tree = 0;
+               rb_erase(&ref->rb_node, &delayed_refs->root);
+               delayed_refs->num_entries--;
+               atomic_set(&ref->refs, 1);
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       struct btrfs_delayed_ref_head *head;
+                       head = btrfs_delayed_node_to_head(ref);
+                       mutex_lock(&head->mutex);
+                       kfree(head->extent_op);
+                       delayed_refs->num_heads--;
+                       if (list_empty(&head->cluster))
+                               delayed_refs->num_heads_ready--;
+                       list_del_init(&head->cluster);
+                       mutex_unlock(&head->mutex);
+               }
+               spin_unlock(&delayed_refs->lock);
+               btrfs_put_delayed_ref(ref);
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+       spin_unlock(&delayed_refs->lock);
+       return ret;
+ }
+ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+ {
+       struct btrfs_pending_snapshot *snapshot;
+       struct list_head splice;
+       INIT_LIST_HEAD(&splice);
+       list_splice_init(&t->pending_snapshots, &splice);
+       while (!list_empty(&splice)) {
+               snapshot = list_entry(splice.next,
+                                     struct btrfs_pending_snapshot,
+                                     list);
+               list_del_init(&snapshot->list);
+               kfree(snapshot);
+       }
+       return 0;
+ }
+ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+ {
+       struct btrfs_inode *btrfs_inode;
+       struct list_head splice;
+       INIT_LIST_HEAD(&splice);
+       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->fs_info->delalloc_lock);
+       while (!list_empty(&splice)) {
+               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                   delalloc_inodes);
+               list_del_init(&btrfs_inode->delalloc_inodes);
+               btrfs_invalidate_inodes(btrfs_inode->root);
+       }
+       spin_unlock(&root->fs_info->delalloc_lock);
+       return 0;
+ }
+ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                       struct extent_io_tree *dirty_pages,
+                                       int mark)
+ {
+       int ret;
+       struct page *page;
+       struct inode *btree_inode = root->fs_info->btree_inode;
+       struct extent_buffer *eb;
+       u64 start = 0;
+       u64 end;
+       u64 offset;
+       unsigned long index;
+       while (1) {
+               ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+                                           mark);
+               if (ret)
+                       break;
+               clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+               while (start <= end) {
+                       index = start >> PAGE_CACHE_SHIFT;
+                       start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+                       page = find_get_page(btree_inode->i_mapping, index);
+                       if (!page)
+                               continue;
+                       offset = page_offset(page);
+                       spin_lock(&dirty_pages->buffer_lock);
+                       eb = radix_tree_lookup(
+                            &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+                                              offset >> PAGE_CACHE_SHIFT);
+                       spin_unlock(&dirty_pages->buffer_lock);
+                       if (eb) {
+                               ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+                                                        &eb->bflags);
+                               atomic_set(&eb->refs, 1);
+                       }
+                       if (PageWriteback(page))
+                               end_page_writeback(page);
+                       lock_page(page);
+                       if (PageDirty(page)) {
+                               clear_page_dirty_for_io(page);
+                               spin_lock_irq(&page->mapping->tree_lock);
+                               radix_tree_tag_clear(&page->mapping->page_tree,
+                                                       page_index(page),
+                                                       PAGECACHE_TAG_DIRTY);
+                               spin_unlock_irq(&page->mapping->tree_lock);
+                       }
+                       page->mapping->a_ops->invalidatepage(page, 0);
+                       unlock_page(page);
+               }
+       }
+       return ret;
+ }
+ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                      struct extent_io_tree *pinned_extents)
+ {
+       struct extent_io_tree *unpin;
+       u64 start;
+       u64 end;
+       int ret;
+       unpin = pinned_extents;
+       while (1) {
+               ret = find_first_extent_bit(unpin, 0, &start, &end,
+                                           EXTENT_DIRTY);
+               if (ret)
+                       break;
+               /* opt_discard */
+               ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+               clear_extent_dirty(unpin, start, end, GFP_NOFS);
+               btrfs_error_unpin_extent_range(root, start, end);
+               cond_resched();
+       }
+       return 0;
+ }
+ static int btrfs_cleanup_transaction(struct btrfs_root *root)
+ {
+       struct btrfs_transaction *t;
+       LIST_HEAD(list);
+       WARN_ON(1);
+       mutex_lock(&root->fs_info->trans_mutex);
+       mutex_lock(&root->fs_info->transaction_kthread_mutex);
+       list_splice_init(&root->fs_info->trans_list, &list);
+       while (!list_empty(&list)) {
+               t = list_entry(list.next, struct btrfs_transaction, list);
+               if (!t)
+                       break;
+               btrfs_destroy_ordered_operations(root);
+               btrfs_destroy_ordered_extents(root);
+               btrfs_destroy_delayed_refs(t, root);
+               btrfs_block_rsv_release(root,
+                                       &root->fs_info->trans_block_rsv,
+                                       t->dirty_pages.dirty_bytes);
+               /* FIXME: cleanup wait for commit */
+               t->in_commit = 1;
+               t->blocked = 1;
+               if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+                       wake_up(&root->fs_info->transaction_blocked_wait);
+               t->blocked = 0;
+               if (waitqueue_active(&root->fs_info->transaction_wait))
+                       wake_up(&root->fs_info->transaction_wait);
+               mutex_unlock(&root->fs_info->trans_mutex);
+               mutex_lock(&root->fs_info->trans_mutex);
+               t->commit_done = 1;
+               if (waitqueue_active(&t->commit_wait))
+                       wake_up(&t->commit_wait);
+               mutex_unlock(&root->fs_info->trans_mutex);
+               mutex_lock(&root->fs_info->trans_mutex);
+               btrfs_destroy_pending_snapshots(t);
+               btrfs_destroy_delalloc_inodes(root);
+               spin_lock(&root->fs_info->new_trans_lock);
+               root->fs_info->running_transaction = NULL;
+               spin_unlock(&root->fs_info->new_trans_lock);
+               btrfs_destroy_marked_extents(root, &t->dirty_pages,
+                                            EXTENT_DIRTY);
+               btrfs_destroy_pinned_extent(root,
+                                           root->fs_info->pinned_extents);
+               t->use_count = 0;
+               list_del_init(&t->list);
+               memset(t, 0, sizeof(*t));
+               kmem_cache_free(btrfs_transaction_cachep, t);
+       }
+       mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+       mutex_unlock(&root->fs_info->trans_mutex);
+       return 0;
+ }
  static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --combined fs/btrfs/extent-tree.c
@@@ -1746,7 -1746,8 +1746,7 @@@ static int remove_extent_backref(struc
  static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
  {
 -      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
 -                      BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 +      blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
  }
  
  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@@ -3089,7 -3090,7 +3089,7 @@@ static u64 get_alloc_profile(struct btr
        return btrfs_reduce_alloc_profile(root, flags);
  }
  
static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
  {
        u64 flags;
  
@@@ -3161,8 -3162,12 +3161,12 @@@ alloc
                                             bytes + 2 * 1024 * 1024,
                                             alloc_target, 0);
                        btrfs_end_transaction(trans, root);
-                       if (ret < 0)
-                               return ret;
+                       if (ret < 0) {
+                               if (ret != -ENOSPC)
+                                       return ret;
+                               else
+                                       goto commit_trans;
+                       }
  
                        if (!data_sinfo) {
                                btrfs_set_inode_space_info(root, inode);
                spin_unlock(&data_sinfo->lock);
  
                /* commit the current transaction and try again */
+ commit_trans:
                if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
@@@ -3721,11 -3727,6 +3726,6 @@@ int btrfs_block_rsv_check(struct btrfs_
                return 0;
        }
  
-       WARN_ON(1);
-       printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-               block_rsv->size, block_rsv->reserved,
-               block_rsv->freed[0], block_rsv->freed[1]);
        return -ENOSPC;
  }
  
@@@ -7970,13 -7971,14 +7970,14 @@@ static int set_block_group_ro(struct bt
  
        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
            sinfo->bytes_may_use + sinfo->bytes_readonly +
-           cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+           cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                sinfo->bytes_reserved += cache->reserved_pinned;
                cache->reserved_pinned = 0;
                cache->ro = 1;
                ret = 0;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
        return ret;
        return ret;
  }
  
+ /*
+  * helper to account the unused space of all the readonly block group in the
+  * list. takes mirrors into account.
+  */
+ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+ {
+       struct btrfs_block_group_cache *block_group;
+       u64 free_bytes = 0;
+       int factor;
+       list_for_each_entry(block_group, groups_list, list) {
+               spin_lock(&block_group->lock);
+               if (!block_group->ro) {
+                       spin_unlock(&block_group->lock);
+                       continue;
+               }
+               if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                                         BTRFS_BLOCK_GROUP_RAID10 |
+                                         BTRFS_BLOCK_GROUP_DUP))
+                       factor = 2;
+               else
+                       factor = 1;
+               free_bytes += (block_group->key.offset -
+                              btrfs_block_group_used(&block_group->item)) *
+                              factor;
+               spin_unlock(&block_group->lock);
+       }
+       return free_bytes;
+ }
+ /*
+  * helper to account the unused space of all the readonly block group in the
+  * space_info. takes mirrors into account.
+  */
+ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+ {
+       int i;
+       u64 free_bytes = 0;
+       spin_lock(&sinfo->lock);
+       for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+               if (!list_empty(&sinfo->block_groups[i]))
+                       free_bytes += __btrfs_get_ro_block_group_free_space(
+                                               &sinfo->block_groups[i]);
+       spin_unlock(&sinfo->lock);
+       return free_bytes;
+ }
  int btrfs_set_block_group_rw(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
  {
@@@ -8092,7 -8150,7 +8149,7 @@@ int btrfs_can_relocate(struct btrfs_roo
        mutex_lock(&root->fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 min_free = btrfs_block_group_used(&block_group->item);
-               u64 dev_offset, max_avail;
+               u64 dev_offset;
  
                /*
                 * check to make sure we can actually find a chunk with enough
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
                        ret = find_free_dev_extent(NULL, device, min_free,
-                                                  &dev_offset, &max_avail);
+                                                  &dev_offset, NULL);
                        if (!ret)
                                break;
                        ret = -1;
        btrfs_free_path(path);
        return ret;
  }
+ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+ {
+       return unpin_extent_range(root, start, end);
+ }
+ int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+                              u64 num_bytes)
+ {
+       return btrfs_discard_extent(root, bytenr, num_bytes);
+ }
diff --combined fs/btrfs/extent_io.c
@@@ -2028,8 -2028,11 +2028,11 @@@ static int __extent_read_full_page(stru
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);
  
-               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        this_bio_flag = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&this_bio_flag,
+                                                em->compress_type);
+               }
  
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
@@@ -3072,10 -3075,13 +3075,12 @@@ static struct extent_buffer *__alloc_ex
  #endif
  
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+       if (eb == NULL)
+               return NULL;
        eb->start = start;
        eb->len = len;
        spin_lock_init(&eb->lock);
        init_waitqueue_head(&eb->lock_wq);
 -      INIT_RCU_HEAD(&eb->rcu_head);
  
  #if LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
diff --combined fs/btrfs/file.c
@@@ -24,7 -24,6 +24,7 @@@
  #include <linux/string.h>
  #include <linux/backing-dev.h>
  #include <linux/mpage.h>
 +#include <linux/falloc.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
  #include <linux/statfs.h>
@@@ -225,6 -224,7 +225,7 @@@ int btrfs_drop_extent_cache(struct inod
  
                        split->bdev = em->bdev;
                        split->flags = flags;
+                       split->compress_type = em->compress_type;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
                        split->flags = flags;
+                       split->compress_type = em->compress_type;
  
                        if (compressed) {
                                split->block_len = em->block_len;
@@@ -891,6 -892,17 +893,17 @@@ static ssize_t btrfs_file_aio_write(str
        if (err)
                goto out;
  
+       /*
+        * If BTRFS flips readonly due to some impossible error
+        * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+        * although we have opened a file as writable, we have
+        * to stop this write operation to ensure FS consistency.
+        */
+       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               err = -EROFS;
+               goto out;
+       }
        file_update_time(file);
        BTRFS_I(inode)->sequence++;
  
@@@ -1238,117 -1250,6 +1251,117 @@@ static int btrfs_file_mmap(struct file      
        return 0;
  }
  
 +static long btrfs_fallocate(struct file *file, int mode,
 +                          loff_t offset, loff_t len)
 +{
 +      struct inode *inode = file->f_path.dentry->d_inode;
 +      struct extent_state *cached_state = NULL;
 +      u64 cur_offset;
 +      u64 last_byte;
 +      u64 alloc_start;
 +      u64 alloc_end;
 +      u64 alloc_hint = 0;
 +      u64 locked_end;
 +      u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 +      struct extent_map *em;
 +      int ret;
 +
 +      alloc_start = offset & ~mask;
 +      alloc_end =  (offset + len + mask) & ~mask;
 +
 +      /* We only support the FALLOC_FL_KEEP_SIZE mode */
 +      if (mode & ~FALLOC_FL_KEEP_SIZE)
 +              return -EOPNOTSUPP;
 +
 +      /*
 +       * wait for ordered IO before we have any locks.  We'll loop again
 +       * below with the locks held.
 +       */
 +      btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
 +
 +      mutex_lock(&inode->i_mutex);
 +      ret = inode_newsize_ok(inode, alloc_end);
 +      if (ret)
 +              goto out;
 +
 +      if (alloc_start > inode->i_size) {
 +              ret = btrfs_cont_expand(inode, alloc_start);
 +              if (ret)
 +                      goto out;
 +      }
 +
 +      ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 +      if (ret)
 +              goto out;
 +
 +      locked_end = alloc_end - 1;
 +      while (1) {
 +              struct btrfs_ordered_extent *ordered;
 +
 +              /* the extent lock is ordered inside the running
 +               * transaction
 +               */
 +              lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
 +                               locked_end, 0, &cached_state, GFP_NOFS);
 +              ordered = btrfs_lookup_first_ordered_extent(inode,
 +                                                          alloc_end - 1);
 +              if (ordered &&
 +                  ordered->file_offset + ordered->len > alloc_start &&
 +                  ordered->file_offset < alloc_end) {
 +                      btrfs_put_ordered_extent(ordered);
 +                      unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 +                                           alloc_start, locked_end,
 +                                           &cached_state, GFP_NOFS);
 +                      /*
 +                       * we can't wait on the range with the transaction
 +                       * running or with the extent lock held
 +                       */
 +                      btrfs_wait_ordered_range(inode, alloc_start,
 +                                               alloc_end - alloc_start);
 +              } else {
 +                      if (ordered)
 +                              btrfs_put_ordered_extent(ordered);
 +                      break;
 +              }
 +      }
 +
 +      cur_offset = alloc_start;
 +      while (1) {
 +              em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 +                                    alloc_end - cur_offset, 0);
 +              BUG_ON(IS_ERR(em) || !em);
 +              last_byte = min(extent_map_end(em), alloc_end);
 +              last_byte = (last_byte + mask) & ~mask;
 +              if (em->block_start == EXTENT_MAP_HOLE ||
 +                  (cur_offset >= inode->i_size &&
 +                   !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 +                      ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
 +                                                      last_byte - cur_offset,
 +                                                      1 << inode->i_blkbits,
 +                                                      offset + len,
 +                                                      &alloc_hint);
 +                      if (ret < 0) {
 +                              free_extent_map(em);
 +                              break;
 +                      }
 +              }
 +              free_extent_map(em);
 +
 +              cur_offset = last_byte;
 +              if (cur_offset >= alloc_end) {
 +                      ret = 0;
 +                      break;
 +              }
 +      }
 +      unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 +                           &cached_state, GFP_NOFS);
 +
 +      btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 +out:
 +      mutex_unlock(&inode->i_mutex);
 +      return ret;
 +}
 +
  const struct file_operations btrfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
 +      .fallocate      = btrfs_fallocate,
        .unlocked_ioctl = btrfs_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
diff --combined fs/btrfs/inode.c
@@@ -122,10 -122,10 +122,10 @@@ static noinline int insert_inline_exten
        size_t cur_size = size;
        size_t datasize;
        unsigned long offset;
-       int use_compress = 0;
+       int compress_type = BTRFS_COMPRESS_NONE;
  
        if (compressed_size && compressed_pages) {
-               use_compress = 1;
+               compress_type = root->fs_info->compress_type;
                cur_size = compressed_size;
        }
  
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);
  
-       if (use_compress) {
+       if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
-                                                 BTRFS_COMPRESS_ZLIB);
+                                                 compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
@@@ -263,6 -263,7 +263,7 @@@ struct async_extent 
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
+       int compress_type;
        struct list_head list;
  };
  
@@@ -280,7 -281,8 +281,8 @@@ static noinline int add_async_extent(st
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
-                                    unsigned long nr_pages)
+                                    unsigned long nr_pages,
+                                    int compress_type)
  {
        struct async_extent *async_extent;
  
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
+       async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
  }
@@@ -332,6 -335,7 +335,7 @@@ static noinline int compress_file_range
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
+       int compress_type = root->fs_info->compress_type;
  
        actual_end = min_t(u64, isize, end + 1);
  again:
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
  
-               ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
-                                               total_compressed, pages,
-                                               nr_pages, &nr_pages_ret,
-                                               &total_in,
-                                               &total_compressed,
-                                               max_compressed);
+               if (BTRFS_I(inode)->force_compress)
+                       compress_type = BTRFS_I(inode)->force_compress;
+               ret = btrfs_compress_pages(compress_type,
+                                          inode->i_mapping, start,
+                                          total_compressed, pages,
+                                          nr_pages, &nr_pages_ret,
+                                          &total_in,
+                                          &total_compressed,
+                                          max_compressed);
  
                if (!ret) {
                        unsigned long offset = total_compressed &
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
-                                total_compressed, pages, nr_pages_ret);
+                                total_compressed, pages, nr_pages_ret,
+                                compress_type);
  
                if (start + num_bytes < end) {
                        start += num_bytes;
@@@ -515,7 -524,8 +524,8 @@@ cleanup_and_bail_uncompressed
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
-               add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+               add_async_extent(async_cow, start, end - start + 1,
+                                0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }
  
@@@ -640,6 -650,7 +650,7 @@@ retry
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
+               em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
  
                                                async_extent->ram_size - 1, 0);
                }
  
-               ret = btrfs_add_ordered_extent(inode, async_extent->start,
-                                              ins.objectid,
-                                              async_extent->ram_size,
-                                              ins.offset,
-                                              BTRFS_ORDERED_COMPRESSED);
+               ret = btrfs_add_ordered_extent_compress(inode,
+                                               async_extent->start,
+                                               ins.objectid,
+                                               async_extent->ram_size,
+                                               ins.offset,
+                                               BTRFS_ORDERED_COMPRESSED,
+                                               async_extent->compress_type);
                BUG_ON(ret);
  
                /*
@@@ -1670,7 -1683,7 +1683,7 @@@ static int btrfs_finish_ordered_io(stru
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
-       int compressed = 0;
+       int compress_type = 0;
        int ret;
        bool nolock = false;
  
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
-               compressed = 1;
+               compress_type = ordered_extent->compress_type;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-               BUG_ON(compressed);
+               BUG_ON(compress_type);
                ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
                                                ordered_extent->disk_len,
                                                ordered_extent->len,
                                                ordered_extent->len,
-                                               compressed, 0, 0,
+                                               compress_type, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
                                   ordered_extent->file_offset,
@@@ -1829,6 -1842,8 +1842,8 @@@ static int btrfs_io_failed_hook(struct 
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        logical = em->block_start;
                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&failrec->bio_flags,
+                                                em->compress_type);
                }
                failrec->logical = logical;
                free_extent_map(em);
@@@ -3671,8 -3686,12 +3686,12 @@@ static int btrfs_setattr_size(struct in
  static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
  {
        struct inode *inode = dentry->d_inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
        int err;
  
+       if (btrfs_root_readonly(root))
+               return -EROFS;
        err = inode_change_ok(inode, attr);
        if (err)
                return err;
@@@ -3877,7 -3896,7 +3896,7 @@@ again
        p = &root->inode_tree.rb_node;
        parent = NULL;
  
 -      if (hlist_unhashed(&inode->i_hash))
 +      if (inode_unhashed(inode))
                return;
  
        spin_lock(&root->inode_lock);
@@@ -4084,6 -4103,8 +4103,6 @@@ struct inode *btrfs_lookup_dentry(struc
        int index;
        int ret;
  
 -      dentry->d_op = &btrfs_dentry_operations;
 -
        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
        return inode;
  }
  
 -static int btrfs_dentry_delete(struct dentry *dentry)
 +static int btrfs_dentry_delete(const struct dentry *dentry)
  {
        struct btrfs_root *root;
  
@@@ -4800,7 -4821,7 +4819,7 @@@ static int btrfs_link(struct dentry *ol
        }
  
        btrfs_set_trans_block_group(trans, dir);
 -      atomic_inc(&inode->i_count);
 +      ihold(inode);
  
        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
  
@@@ -4928,8 -4949,10 +4947,10 @@@ static noinline int uncompress_inline(s
        size_t max_size;
        unsigned long inline_size;
        unsigned long ptr;
+       int compress_type;
  
        WARN_ON(pg_offset != 0);
+       compress_type = btrfs_file_extent_compression(leaf, item);
        max_size = btrfs_file_extent_ram_bytes(leaf, item);
        inline_size = btrfs_file_extent_inline_item_len(leaf,
                                        btrfs_item_nr(leaf, path->slots[0]));
        read_extent_buffer(leaf, tmp, ptr, inline_size);
  
        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
-       ret = btrfs_zlib_decompress(tmp, page, extent_offset,
-                                   inline_size, max_size);
+       ret = btrfs_decompress(compress_type, tmp, page,
+                              extent_offset, inline_size, max_size);
        if (ret) {
                char *kaddr = kmap_atomic(page, KM_USER0);
                unsigned long copy_size = min_t(u64,
@@@ -4982,7 -5005,7 +5003,7 @@@ struct extent_map *btrfs_get_extent(str
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
-       int compressed;
+       int compress_type;
  
  again:
        read_lock(&em_tree->lock);
  
        found_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
-       compressed = btrfs_file_extent_compression(leaf, item);
+       compress_type = btrfs_file_extent_compression(leaf, item);
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
                        em->block_start = EXTENT_MAP_HOLE;
                        goto insert;
                }
-               if (compressed) {
+               if (compress_type != BTRFS_COMPRESS_NONE) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                       em->compress_type = compress_type;
                        em->block_start = bytenr;
                        em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
                                                                         item);
                em->len = (copy_size + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
                em->orig_start = EXTENT_MAP_INLINE;
-               if (compressed)
+               if (compress_type) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                       em->compress_type = compress_type;
+               }
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
                if (create == 0 && !PageUptodate(page)) {
-                       if (btrfs_file_extent_compression(leaf, item) ==
-                           BTRFS_COMPRESS_ZLIB) {
+                       if (btrfs_file_extent_compression(leaf, item) !=
+                           BTRFS_COMPRESS_NONE) {
                                ret = uncompress_inline(path, inode, page,
                                                        pg_offset,
                                                        extent_offset, item);
@@@ -6477,7 -6503,7 +6501,7 @@@ struct inode *btrfs_alloc_inode(struct 
        ei->ordered_data_close = 0;
        ei->orphan_meta_reserved = 0;
        ei->dummy_inode = 0;
-       ei->force_compress = 0;
+       ei->force_compress = BTRFS_COMPRESS_NONE;
  
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
        return inode;
  }
  
 +static void btrfs_i_callback(struct rcu_head *head)
 +{
 +      struct inode *inode = container_of(head, struct inode, i_rcu);
 +      INIT_LIST_HEAD(&inode->i_dentry);
 +      kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 +}
 +
  void btrfs_destroy_inode(struct inode *inode)
  {
        struct btrfs_ordered_extent *ordered;
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
  free:
 -      kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 +      call_rcu(&inode->i_rcu, btrfs_i_callback);
  }
  
  int btrfs_drop_inode(struct inode *inode)
@@@ -7098,16 -7117,126 +7122,20 @@@ int btrfs_prealloc_file_range_trans(str
                                           min_size, actual_len, alloc_hint, trans);
  }
  
 -static long btrfs_fallocate(struct inode *inode, int mode,
 -                          loff_t offset, loff_t len)
 -{
 -      struct extent_state *cached_state = NULL;
 -      u64 cur_offset;
 -      u64 last_byte;
 -      u64 alloc_start;
 -      u64 alloc_end;
 -      u64 alloc_hint = 0;
 -      u64 locked_end;
 -      u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 -      struct extent_map *em;
 -      int ret;
 -
 -      alloc_start = offset & ~mask;
 -      alloc_end =  (offset + len + mask) & ~mask;
 -
 -      /*
 -       * wait for ordered IO before we have any locks.  We'll loop again
 -       * below with the locks held.
 -       */
 -      btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
 -
 -      mutex_lock(&inode->i_mutex);
 -      ret = inode_newsize_ok(inode, alloc_end);
 -      if (ret)
 -              goto out;
 -
 -      if (alloc_start > inode->i_size) {
 -              ret = btrfs_cont_expand(inode, alloc_start);
 -              if (ret)
 -                      goto out;
 -      }
 -
 -      ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 -      if (ret)
 -              goto out;
 -
 -      locked_end = alloc_end - 1;
 -      while (1) {
 -              struct btrfs_ordered_extent *ordered;
 -
 -              /* the extent lock is ordered inside the running
 -               * transaction
 -               */
 -              lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
 -                               locked_end, 0, &cached_state, GFP_NOFS);
 -              ordered = btrfs_lookup_first_ordered_extent(inode,
 -                                                          alloc_end - 1);
 -              if (ordered &&
 -                  ordered->file_offset + ordered->len > alloc_start &&
 -                  ordered->file_offset < alloc_end) {
 -                      btrfs_put_ordered_extent(ordered);
 -                      unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 -                                           alloc_start, locked_end,
 -                                           &cached_state, GFP_NOFS);
 -                      /*
 -                       * we can't wait on the range with the transaction
 -                       * running or with the extent lock held
 -                       */
 -                      btrfs_wait_ordered_range(inode, alloc_start,
 -                                               alloc_end - alloc_start);
 -              } else {
 -                      if (ordered)
 -                              btrfs_put_ordered_extent(ordered);
 -                      break;
 -              }
 -      }
 -
 -      cur_offset = alloc_start;
 -      while (1) {
 -              em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 -                                    alloc_end - cur_offset, 0);
 -              BUG_ON(IS_ERR(em) || !em);
 -              last_byte = min(extent_map_end(em), alloc_end);
 -              last_byte = (last_byte + mask) & ~mask;
 -              if (em->block_start == EXTENT_MAP_HOLE ||
 -                  (cur_offset >= inode->i_size &&
 -                   !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 -                      ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
 -                                                      last_byte - cur_offset,
 -                                                      1 << inode->i_blkbits,
 -                                                      offset + len,
 -                                                      &alloc_hint);
 -                      if (ret < 0) {
 -                              free_extent_map(em);
 -                              break;
 -                      }
 -              }
 -              free_extent_map(em);
 -
 -              cur_offset = last_byte;
 -              if (cur_offset >= alloc_end) {
 -                      ret = 0;
 -                      break;
 -              }
 -      }
 -      unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 -                           &cached_state, GFP_NOFS);
 -
 -      btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 -out:
 -      mutex_unlock(&inode->i_mutex);
 -      return ret;
 -}
 -
  static int btrfs_set_page_dirty(struct page *page)
  {
        return __set_page_dirty_nobuffers(page);
  }
  
 -static int btrfs_permission(struct inode *inode, int mask)
 +static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
  {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+               return -EROFS;
        if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
                return -EACCES;
 -      return generic_permission(inode, mask, btrfs_check_acl);
 +      return generic_permission(inode, mask, flags, btrfs_check_acl);
  }
  
  static const struct inode_operations btrfs_dir_inode_operations = {
@@@ -7200,6 -7329,7 +7228,6 @@@ static const struct inode_operations bt
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
 -      .fallocate      = btrfs_fallocate,
        .fiemap         = btrfs_fiemap,
  };
  static const struct inode_operations btrfs_special_inode_operations = {
diff --combined fs/btrfs/super.c
  
  static const struct super_operations btrfs_super_ops;
  
+ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+                                     char nbuf[16])
+ {
+       char *errstr = NULL;
+       switch (errno) {
+       case -EIO:
+               errstr = "IO failure";
+               break;
+       case -ENOMEM:
+               errstr = "Out of memory";
+               break;
+       case -EROFS:
+               errstr = "Readonly filesystem";
+               break;
+       default:
+               if (nbuf) {
+                       if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+                               errstr = nbuf;
+               }
+               break;
+       }
+       return errstr;
+ }
+ static void __save_error_info(struct btrfs_fs_info *fs_info)
+ {
+       /*
+        * today we only save the error info into ram.  Long term we'll
+        * also send it down to the disk
+        */
+       fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+ }
+ /* NOTE:
+  *    We move the write_super stuff to umount in order to avoid a
+  *    deadlock, because umount holds all the locks.
+  */
+ static void save_error_info(struct btrfs_fs_info *fs_info)
+ {
+       __save_error_info(fs_info);
+ }
+ /* btrfs handle error by forcing the filesystem readonly */
+ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+ {
+       struct super_block *sb = fs_info->sb;
+       if (sb->s_flags & MS_RDONLY)
+               return;
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               sb->s_flags |= MS_RDONLY;
+               printk(KERN_INFO "btrfs is forced readonly\n");
+       }
+ }
+ /*
+  * __btrfs_std_error decodes expected errors from the caller and
+  * invokes the appropriate error response.
+  */
+ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+                    unsigned int line, int errno)
+ {
+       struct super_block *sb = fs_info->sb;
+       char nbuf[16];
+       const char *errstr;
+       /*
+        * Special case: if the error is EROFS, and we're already
+        * under MS_RDONLY, then it is safe here.
+        */
+       if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+               return;
+       errstr = btrfs_decode_error(fs_info, errno, nbuf);
+       printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+               sb->s_id, function, line, errstr);
+       save_error_info(fs_info);
+       btrfs_handle_error(fs_info);
+ }
  static void btrfs_put_super(struct super_block *sb)
  {
        struct btrfs_root *root = btrfs_sb(sb);
@@@ -69,9 -153,9 +153,9 @@@ enum 
        Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
        Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
-       Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-       Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
-       Opt_user_subvol_rm_allowed,
+       Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+       Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+       Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
  };
  
  static match_table_t tokens = {
        {Opt_alloc_start, "alloc_start=%s"},
        {Opt_thread_pool, "thread_pool=%d"},
        {Opt_compress, "compress"},
+       {Opt_compress_type, "compress=%s"},
        {Opt_compress_force, "compress-force"},
+       {Opt_compress_force_type, "compress-force=%s"},
        {Opt_ssd, "ssd"},
        {Opt_ssd_spread, "ssd_spread"},
        {Opt_nossd, "nossd"},
@@@ -112,6 -198,8 +198,8 @@@ int btrfs_parse_options(struct btrfs_ro
        char *p, *num, *orig;
        int intarg;
        int ret = 0;
+       char *compress_type;
+       bool compress_force = false;
  
        if (!options)
                return 0;
                        btrfs_set_opt(info->mount_opt, NODATACOW);
                        btrfs_set_opt(info->mount_opt, NODATASUM);
                        break;
-               case Opt_compress:
-                       printk(KERN_INFO "btrfs: use compression\n");
-                       btrfs_set_opt(info->mount_opt, COMPRESS);
-                       break;
                case Opt_compress_force:
-                       printk(KERN_INFO "btrfs: forcing compression\n");
-                       btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+               case Opt_compress_force_type:
+                       compress_force = true;
+               case Opt_compress:
+               case Opt_compress_type:
+                       if (token == Opt_compress ||
+                           token == Opt_compress_force ||
+                           strcmp(args[0].from, "zlib") == 0) {
+                               compress_type = "zlib";
+                               info->compress_type = BTRFS_COMPRESS_ZLIB;
+                       } else if (strcmp(args[0].from, "lzo") == 0) {
+                               compress_type = "lzo";
+                               info->compress_type = BTRFS_COMPRESS_LZO;
+                       } else {
+                               ret = -EINVAL;
+                               goto out;
+                       }
                        btrfs_set_opt(info->mount_opt, COMPRESS);
+                       if (compress_force) {
+                               btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+                               pr_info("btrfs: force %s compression\n",
+                                       compress_type);
+                       } else
+                               pr_info("btrfs: use %s compression\n",
+                                       compress_type);
                        break;
                case Opt_ssd:
                        printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@@ -460,7 -566,6 +566,7 @@@ static int btrfs_fill_super(struct supe
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_magic = BTRFS_SUPER_MAGIC;
        sb->s_op = &btrfs_super_ops;
 +      sb->s_d_op = &btrfs_dentry_operations;
        sb->s_export_op = &btrfs_export_ops;
        sb->s_xattr = btrfs_xattr_handlers;
        sb->s_time_gran = 1;
@@@ -590,8 -695,8 +696,8 @@@ static int btrfs_set_super(struct super
   * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
   *      for multiple device setup.  Make sure to keep it in sync.
   */
 -static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 -              const char *dev_name, void *data, struct vfsmount *mnt)
 +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 +              const char *dev_name, void *data)
  {
        struct block_device *bdev = NULL;
        struct super_block *s;
                                          &subvol_name, &subvol_objectid,
                                          &fs_devices);
        if (error)
 -              return error;
 +              return ERR_PTR(error);
  
        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
        if (error)
                root = new_root;
        }
  
 -      mnt->mnt_sb = s;
 -      mnt->mnt_root = root;
 -
        kfree(subvol_name);
 -      return 0;
 +      return root;
  
  error_s:
        error = PTR_ERR(s);
@@@ -713,7 -821,7 +819,7 @@@ error_close_devices
        kfree(tree_root);
  error_free_subvol_name:
        kfree(subvol_name);
 -      return error;
 +      return ERR_PTR(error);
  }
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        return 0;
  }
  
+ /*
+  * The helper to calc the free space on the devices that can be used to store
+  * file data.
+  */
+ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+ {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_device_info *devices_info;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       u64 skip_space;
+       u64 type;
+       u64 avail_space;
+       u64 used_space;
+       u64 min_stripe_size;
+       int min_stripes = 1;
+       int i = 0, nr_devices;
+       int ret;
+       nr_devices = fs_info->fs_devices->rw_devices;
+       BUG_ON(!nr_devices);
+       devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+                              GFP_NOFS);
+       if (!devices_info)
+               return -ENOMEM;
+       /* calc min stripe number for data space allocation */
+       type = btrfs_get_alloc_profile(root, 1);
+       if (type & BTRFS_BLOCK_GROUP_RAID0)
+               min_stripes = 2;
+       else if (type & BTRFS_BLOCK_GROUP_RAID1)
+               min_stripes = 2;
+       else if (type & BTRFS_BLOCK_GROUP_RAID10)
+               min_stripes = 4;
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+       else
+               min_stripe_size = BTRFS_STRIPE_LEN;
+       list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+               if (!device->in_fs_metadata)
+                       continue;
+               avail_space = device->total_bytes - device->bytes_used;
+               /* align with stripe_len */
+               do_div(avail_space, BTRFS_STRIPE_LEN);
+               avail_space *= BTRFS_STRIPE_LEN;
+               /*
+                * In order to avoid overwriting the superblock on the drive,
+                * btrfs starts at an offset of at least 1MB when doing chunk
+                * allocation.
+                */
+               skip_space = 1024 * 1024;
+               /* user can set the offset in fs_info->alloc_start. */
+               if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+                   device->total_bytes)
+                       skip_space = max(fs_info->alloc_start, skip_space);
+               /*
+                * btrfs can not use the free space in [0, skip_space - 1],
+                * we must subtract it from the total. In order to implement
+                * it, we account the used space in this range first.
+                */
+               ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+                                                    &used_space);
+               if (ret) {
+                       kfree(devices_info);
+                       return ret;
+               }
+               /* calc the free space in [0, skip_space - 1] */
+               skip_space -= used_space;
+               /*
+                * we can use the free space in [0, skip_space - 1], subtract
+                * it from the total.
+                */
+               if (avail_space && avail_space >= skip_space)
+                       avail_space -= skip_space;
+               else
+                       avail_space = 0;
+               if (avail_space < min_stripe_size)
+                       continue;
+               devices_info[i].dev = device;
+               devices_info[i].max_avail = avail_space;
+               i++;
+       }
+       nr_devices = i;
+       btrfs_descending_sort_devices(devices_info, nr_devices);
+       i = nr_devices - 1;
+       avail_space = 0;
+       while (nr_devices >= min_stripes) {
+               if (devices_info[i].max_avail >= min_stripe_size) {
+                       int j;
+                       u64 alloc_size;
+                       avail_space += devices_info[i].max_avail * min_stripes;
+                       alloc_size = devices_info[i].max_avail;
+                       for (j = i + 1 - min_stripes; j <= i; j++)
+                               devices_info[j].max_avail -= alloc_size;
+               }
+               i--;
+               nr_devices--;
+       }
+       kfree(devices_info);
+       *free_bytes = avail_space;
+       return 0;
+ }
  static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
        struct btrfs_root *root = btrfs_sb(dentry->d_sb);
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
-       u64 total_used_data = 0;
+       u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
+       int ret;
  
+       /* holding chunk_mutex to avoid allocating new chunks */
+       mutex_lock(&root->fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
-               if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
-                                   BTRFS_BLOCK_GROUP_SYSTEM))
-                       total_used_data += found->disk_total;
-               else
-                       total_used_data += found->disk_used;
+               if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+                       total_free_data += found->disk_total - found->disk_used;
+                       total_free_data -=
+                               btrfs_account_ro_block_groups_free_space(found);
+               }
                total_used += found->disk_used;
        }
        rcu_read_unlock();
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-       buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
+       buf->f_bavail = total_free_data;
+       ret = btrfs_calc_avail_data_space(root, &total_free_data);
+       if (ret) {
+               mutex_unlock(&root->fs_info->chunk_mutex);
+               return ret;
+       }
+       buf->f_bavail += total_free_data;
+       buf->f_bavail = buf->f_bavail >> bits;
+       mutex_unlock(&root->fs_info->chunk_mutex);
  
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
  static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
 -      .get_sb         = btrfs_get_sb,
 +      .mount          = btrfs_mount,
        .kill_sb        = kill_anon_super,
        .fs_flags       = FS_REQUIRES_DEV,
  };
@@@ -866,7 -1107,6 +1105,7 @@@ static const struct file_operations btr
        .unlocked_ioctl  = btrfs_control_ioctl,
        .compat_ioctl = btrfs_control_ioctl,
        .owner   = THIS_MODULE,
 +      .llseek = noop_llseek,
  };
  
  static struct miscdevice btrfs_misc = {
@@@ -897,10 -1137,14 +1136,14 @@@ static int __init init_btrfs_fs(void
        if (err)
                return err;
  
-       err = btrfs_init_cachep();
+       err = btrfs_init_compress();
        if (err)
                goto free_sysfs;
  
+       err = btrfs_init_cachep();
+       if (err)
+               goto free_compress;
        err = extent_io_init();
        if (err)
                goto free_cachep;
@@@ -928,6 -1172,8 +1171,8 @@@ free_extent_io
        extent_io_exit();
  free_cachep:
        btrfs_destroy_cachep();
+ free_compress:
+       btrfs_exit_compress();
  free_sysfs:
        btrfs_exit_sysfs();
        return err;
@@@ -942,7 -1188,7 +1187,7 @@@ static void __exit exit_btrfs_fs(void
        unregister_filesystem(&btrfs_fs_type);
        btrfs_exit_sysfs();
        btrfs_cleanup_fs_uuids();
-       btrfs_zlib_exit();
+       btrfs_exit_compress();
  }
  
  module_init(init_btrfs_fs)
diff --combined fs/btrfs/volumes.c
@@@ -22,6 -22,7 +22,7 @@@
  #include <linux/blkdev.h>
  #include <linux/random.h>
  #include <linux/iocontext.h>
+ #include <linux/capability.h>
  #include <asm/div64.h>
  #include "compat.h"
  #include "ctree.h"
@@@ -398,6 -399,7 +399,6 @@@ static noinline int device_list_add(con
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
 -              device->barriers = 1;
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
@@@ -465,6 -467,7 +466,6 @@@ static struct btrfs_fs_devices *clone_f
                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
 -              device->barriers = 1;
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);
@@@ -493,7 -496,7 +494,7 @@@ again
                        continue;
  
                if (device->bdev) {
 -                      close_bdev_exclusive(device->bdev, device->mode);
 +                      blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
@@@ -527,7 -530,7 +528,7 @@@ static int __btrfs_close_devices(struc
  
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev) {
 -                      close_bdev_exclusive(device->bdev, device->mode);
 +                      blkdev_put(device->bdev, device->mode);
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
@@@ -584,15 -587,13 +585,15 @@@ static int __btrfs_open_devices(struct 
        int seeding = 1;
        int ret = 0;
  
 +      flags |= FMODE_EXCL;
 +
        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;
  
 -              bdev = open_bdev_exclusive(device->name, flags, holder);
 +              bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                set_blocksize(bdev, 4096);
  
                bh = btrfs_read_dev_super(bdev);
-               if (!bh)
+               if (!bh) {
+                       ret = -EINVAL;
                        goto error_close;
+               }
  
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
  error_brelse:
                brelse(bh);
  error_close:
 -              close_bdev_exclusive(bdev, FMODE_READ);
 +              blkdev_put(bdev, flags);
  error:
                continue;
        }
@@@ -690,8 -693,7 +693,8 @@@ int btrfs_scan_one_device(const char *p
  
        mutex_lock(&uuid_mutex);
  
 -      bdev = open_bdev_exclusive(path, flags, holder);
 +      flags |= FMODE_EXCL;
 +      bdev = blkdev_get_by_path(path, flags, holder);
  
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
-               ret = -EIO;
+               ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
  
        brelse(bh);
  error_close:
 -      close_bdev_exclusive(bdev, flags);
 +      blkdev_put(bdev, flags);
  error:
        mutex_unlock(&uuid_mutex);
        return ret;
  }
  
+ /* helper to account the used device space in the range */
+ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+                                  u64 end, u64 *length)
+ {
+       struct btrfs_key key;
+       struct btrfs_root *root = device->dev_root;
+       struct btrfs_dev_extent *dev_extent;
+       struct btrfs_path *path;
+       u64 extent_end;
+       int ret;
+       int slot;
+       struct extent_buffer *l;
+       *length = 0;
+       if (start >= device->total_bytes)
+               return 0;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->reada = 2;
+       key.objectid = device->devid;
+       key.offset = start;
+       key.type = BTRFS_DEV_EXTENT_KEY;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       if (ret > 0) {
+               ret = btrfs_previous_item(root, path, key.objectid, key.type);
+               if (ret < 0)
+                       goto out;
+       }
+       while (1) {
+               l = path->nodes[0];
+               slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(l)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret == 0)
+                               continue;
+                       if (ret < 0)
+                               goto out;
+                       break;
+               }
+               btrfs_item_key_to_cpu(l, &key, slot);
+               if (key.objectid < device->devid)
+                       goto next;
+               if (key.objectid > device->devid)
+                       break;
+               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                       goto next;
+               dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+               extent_end = key.offset + btrfs_dev_extent_length(l,
+                                                                 dev_extent);
+               if (key.offset <= start && extent_end > end) {
+                       *length = end - start + 1;
+                       break;
+               } else if (key.offset <= start && extent_end > start)
+                       *length += extent_end - start;
+               else if (key.offset > start && extent_end <= end)
+                       *length += extent_end - key.offset;
+               else if (key.offset > start && key.offset <= end) {
+                       *length += end - key.offset + 1;
+                       break;
+               } else if (key.offset > end)
+                       break;
+ next:
+               path->slots[0]++;
+       }
+       ret = 0;
+ out:
+       btrfs_free_path(path);
+       return ret;
+ }
  /*
+  * find_free_dev_extent - find free space in the specified device
+  * @trans:    transaction handler
+  * @device:   the device which we search the free space in
+  * @num_bytes:        the size of the free space that we need
+  * @start:    store the start of the free space.
+  * @len:      the size of the free space that we find, or the size of the max
+  *            free space if we don't find suitable free space
+  *
   * this uses a pretty simple search, the expectation is that it is
   * called very infrequently and that a given device has a small number
   * of extents
+  *
+  * @start is used to store the start of the free space if we find it. But if
+  * we don't find suitable free space, it will be used to store the start
+  * position of the max free space.
+  *
+  * @len is used to store the size of the free space that we find.
+  * But if we don't find suitable free space, it is used to store the size of
+  * the max free space.
   */
  int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
-                        u64 *start, u64 *max_avail)
+                        u64 *start, u64 *len)
  {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
-       struct btrfs_dev_extent *dev_extent = NULL;
+       struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
-       u64 hole_size = 0;
-       u64 last_byte = 0;
-       u64 search_start = 0;
+       u64 hole_size;
+       u64 max_hole_start;
+       u64 max_hole_size;
+       u64 extent_end;
+       u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
-       int slot = 0;
-       int start_found;
+       int slot;
        struct extent_buffer *l;
  
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       path->reada = 2;
-       start_found = 0;
        /* FIXME use last free of some kind */
  
        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
-       search_start = max((u64)1024 * 1024, search_start);
+       search_start = 1024 * 1024;
  
-       if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+       if (root->fs_info->alloc_start + num_bytes <= search_end)
                search_start = max(root->fs_info->alloc_start, search_start);
  
+       max_hole_start = search_start;
+       max_hole_size = 0;
+       if (search_start >= search_end) {
+               ret = -ENOSPC;
+               goto error;
+       }
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto error;
+       }
+       path->reada = 2;
        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
-               goto error;
+               goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
-                       goto error;
-               if (ret > 0)
-                       start_found = 1;
+                       goto out;
        }
-       l = path->nodes[0];
-       btrfs_item_key_to_cpu(l, &key, path->slots[0]);
        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                        if (ret == 0)
                                continue;
                        if (ret < 0)
-                               goto error;
- no_more_items:
-                       if (!start_found) {
-                               if (search_start >= search_end) {
-                                       ret = -ENOSPC;
-                                       goto error;
-                               }
-                               *start = search_start;
-                               start_found = 1;
-                               goto check_pending;
-                       }
-                       *start = last_byte > search_start ?
-                               last_byte : search_start;
-                       if (search_end <= *start) {
-                               ret = -ENOSPC;
-                               goto error;
-                       }
-                       goto check_pending;
+                               goto out;
+                       break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);
  
                        goto next;
  
                if (key.objectid > device->devid)
-                       goto no_more_items;
+                       break;
  
-               if (key.offset >= search_start && key.offset > last_byte &&
-                   start_found) {
-                       if (last_byte < search_start)
-                               last_byte = search_start;
-                       hole_size = key.offset - last_byte;
+               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                       goto next;
  
-                       if (hole_size > *max_avail)
-                               *max_avail = hole_size;
+               if (key.offset > search_start) {
+                       hole_size = key.offset - search_start;
+                       if (hole_size > max_hole_size) {
+                               max_hole_start = search_start;
+                               max_hole_size = hole_size;
+                       }
  
-                       if (key.offset > last_byte &&
-                           hole_size >= num_bytes) {
-                               *start = last_byte;
-                               goto check_pending;
+                       /*
+                        * If this free space is greater than which we need,
+                        * it must be the max free space that we have found
+                        * until now, so max_hole_start must point to the start
+                        * of this free space and the length of this free space
+                        * is stored in max_hole_size. Thus, we return
+                        * max_hole_start and max_hole_size and go back to the
+                        * caller.
+                        */
+                       if (hole_size >= num_bytes) {
+                               ret = 0;
+                               goto out;
                        }
                }
-               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
-                       goto next;
  
-               start_found = 1;
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-               last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+               extent_end = key.offset + btrfs_dev_extent_length(l,
+                                                                 dev_extent);
+               if (extent_end > search_start)
+                       search_start = extent_end;
  next:
                path->slots[0]++;
                cond_resched();
        }
- check_pending:
-       /* we have to make sure we didn't find an extent that has already
-        * been allocated by the map tree or the original allocation
-        */
-       BUG_ON(*start < search_start);
  
-       if (*start + num_bytes > search_end) {
-               ret = -ENOSPC;
-               goto error;
+       hole_size = search_end- search_start;
+       if (hole_size > max_hole_size) {
+               max_hole_start = search_start;
+               max_hole_size = hole_size;
        }
-       /* check for pending inserts here */
-       ret = 0;
  
- error:
+       /* See above. */
+       if (hole_size < num_bytes)
+               ret = -ENOSPC;
+       else
+               ret = 0;
+ out:
        btrfs_free_path(path);
+ error:
+       *start = max_hole_start;
+       if (len)
+               *len = max_hole_size;
        return ret;
  }
  
@@@ -1186,8 -1295,8 +1296,8 @@@ int btrfs_rm_device(struct btrfs_root *
                        goto out;
                }
        } else {
 -              bdev = open_bdev_exclusive(device_path, FMODE_READ,
 -                                    root->fs_info->bdev_holder);
 +              bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
 +                                        root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
-                       ret = -EIO;
+                       ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;
  
        if (device->bdev) {
 -              close_bdev_exclusive(device->bdev, device->mode);
 +              blkdev_put(device->bdev, device->mode);
                device->bdev = NULL;
                device->fs_devices->open_devices--;
        }
@@@ -1297,7 -1406,7 +1407,7 @@@ error_brelse
        brelse(bh);
  error_close:
        if (bdev)
 -              close_bdev_exclusive(bdev, FMODE_READ);
 +              blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
@@@ -1449,8 -1558,7 +1559,8 @@@ int btrfs_init_new_device(struct btrfs_
        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
                return -EINVAL;
  
 -      bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
 +      bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
 +                                root->fs_info->bdev_holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
  
        trans = btrfs_start_transaction(root, 0);
        lock_chunks(root);
  
 -      device->barriers = 1;
        device->writeable = 1;
        device->work.func = pending_bios_fn;
        generate_random_uuid(device->uuid);
@@@ -1576,7 -1685,7 +1686,7 @@@ out
        mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
  error:
 -      close_bdev_exclusive(bdev, 0);
 +      blkdev_put(bdev, FMODE_EXCL);
        if (seeding_dev) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
@@@ -1916,6 -2025,9 +2026,9 @@@ int btrfs_balance(struct btrfs_root *de
        if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
  
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
        mutex_lock(&dev_root->fs_info->volume_mutex);
        dev_root = dev_root->fs_info->dev_root;
  
@@@ -2154,66 -2266,67 +2267,67 @@@ static noinline u64 chunk_bytes_by_type
                return calc_size * num_stripes;
  }
  
- static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root,
-                              struct map_lookup **map_ret,
-                              u64 *num_bytes, u64 *stripe_size,
-                              u64 start, u64 type)
+ /* Used to sort the devices by max_avail(descending sort) */
+ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
  {
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct btrfs_device *device = NULL;
-       struct btrfs_fs_devices *fs_devices = info->fs_devices;
-       struct list_head *cur;
-       struct map_lookup *map = NULL;
-       struct extent_map_tree *em_tree;
-       struct extent_map *em;
-       struct list_head private_devs;
-       int min_stripe_size = 1 * 1024 * 1024;
-       u64 calc_size = 1024 * 1024 * 1024;
-       u64 max_chunk_size = calc_size;
-       u64 min_free;
-       u64 avail;
-       u64 max_avail = 0;
-       u64 dev_offset;
-       int num_stripes = 1;
-       int min_stripes = 1;
-       int sub_stripes = 0;
-       int looped = 0;
-       int ret;
-       int index;
-       int stripe_len = 64 * 1024;
+       if (((struct btrfs_device_info *)dev_info1)->max_avail >
+           ((struct btrfs_device_info *)dev_info2)->max_avail)
+               return -1;
+       else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+                ((struct btrfs_device_info *)dev_info2)->max_avail)
+               return 1;
+       else
+               return 0;
+ }
  
-       if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
-           (type & BTRFS_BLOCK_GROUP_DUP)) {
-               WARN_ON(1);
-               type &= ~BTRFS_BLOCK_GROUP_DUP;
-       }
-       if (list_empty(&fs_devices->alloc_list))
-               return -ENOSPC;
+ static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+                                int *num_stripes, int *min_stripes,
+                                int *sub_stripes)
+ {
+       *num_stripes = 1;
+       *min_stripes = 1;
+       *sub_stripes = 0;
  
        if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-               num_stripes = fs_devices->rw_devices;
-               min_stripes = 2;
+               *num_stripes = fs_devices->rw_devices;
+               *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-               num_stripes = 2;
-               min_stripes = 2;
+               *num_stripes = 2;
+               *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
                if (fs_devices->rw_devices < 2)
                        return -ENOSPC;
-               num_stripes = 2;
-               min_stripes = 2;
+               *num_stripes = 2;
+               *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-               num_stripes = fs_devices->rw_devices;
-               if (num_stripes < 4)
+               *num_stripes = fs_devices->rw_devices;
+               if (*num_stripes < 4)
                        return -ENOSPC;
-               num_stripes &= ~(u32)1;
-               sub_stripes = 2;
-               min_stripes = 4;
+               *num_stripes &= ~(u32)1;
+               *sub_stripes = 2;
+               *min_stripes = 4;
        }
  
+       return 0;
+ }
+ static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+                                   u64 proposed_size, u64 type,
+                                   int num_stripes, int small_stripe)
+ {
+       int min_stripe_size = 1 * 1024 * 1024;
+       u64 calc_size = proposed_size;
+       u64 max_chunk_size = calc_size;
+       int ncopies = 1;
+       if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+                   BTRFS_BLOCK_GROUP_DUP |
+                   BTRFS_BLOCK_GROUP_RAID10))
+               ncopies = 2;
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_chunk_size = 10 * calc_size;
                min_stripe_size = 64 * 1024 * 1024;
        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
                             max_chunk_size);
  
- again:
-       max_avail = 0;
-       if (!map || map->num_stripes != num_stripes) {
-               kfree(map);
-               map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-               if (!map)
-                       return -ENOMEM;
-               map->num_stripes = num_stripes;
-       }
-       if (calc_size * num_stripes > max_chunk_size) {
-               calc_size = max_chunk_size;
+       if (calc_size * num_stripes > max_chunk_size * ncopies) {
+               calc_size = max_chunk_size * ncopies;
                do_div(calc_size, num_stripes);
-               do_div(calc_size, stripe_len);
-               calc_size *= stripe_len;
+               do_div(calc_size, BTRFS_STRIPE_LEN);
+               calc_size *= BTRFS_STRIPE_LEN;
        }
  
        /* we don't want tiny stripes */
-       if (!looped)
+       if (!small_stripe)
                calc_size = max_t(u64, min_stripe_size, calc_size);
  
        /*
-        * we're about to do_div by the stripe_len so lets make sure
+        * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
         * we end up with something bigger than a stripe
         */
-       calc_size = max_t(u64, calc_size, stripe_len * 4);
+       calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+       do_div(calc_size, BTRFS_STRIPE_LEN);
+       calc_size *= BTRFS_STRIPE_LEN;
+       return calc_size;
+ }
+ static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+                                                     int num_stripes)
+ {
+       struct map_lookup *new;
+       size_t len = map_lookup_size(num_stripes);
+       BUG_ON(map->num_stripes < num_stripes);
+       if (map->num_stripes == num_stripes)
+               return map;
+       new = kmalloc(len, GFP_NOFS);
+       if (!new) {
+               /* just change map->num_stripes */
+               map->num_stripes = num_stripes;
+               return map;
+       }
+       memcpy(new, map, len);
+       new->num_stripes = num_stripes;
+       kfree(map);
+       return new;
+ }
+ /*
+  * helper to allocate device space from btrfs_device_info, in which we stored
+  * max free space information of every device. It is used when we can not
+  * allocate chunks by default size.
+  *
+  * By this helper, we can allocate a new chunk as larger as possible.
+  */
+ static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_devices *fs_devices,
+                                   struct btrfs_device_info *devices,
+                                   int nr_device, u64 type,
+                                   struct map_lookup **map_lookup,
+                                   int min_stripes, u64 *stripe_size)
+ {
+       int i, index, sort_again = 0;
+       int min_devices = min_stripes;
+       u64 max_avail, min_free;
+       struct map_lookup *map = *map_lookup;
+       int ret;
+       if (nr_device < min_stripes)
+               return -ENOSPC;
+       btrfs_descending_sort_devices(devices, nr_device);
+       max_avail = devices[0].max_avail;
+       if (!max_avail)
+               return -ENOSPC;
+       for (i = 0; i < nr_device; i++) {
+               /*
+                * if dev_offset = 0, it means the free space of this device
+                * is less than what we need, and we didn't search max avail
+                * extent on this device, so do it now.
+                */
+               if (!devices[i].dev_offset) {
+                       ret = find_free_dev_extent(trans, devices[i].dev,
+                                                  max_avail,
+                                                  &devices[i].dev_offset,
+                                                  &devices[i].max_avail);
+                       if (ret != 0 && ret != -ENOSPC)
+                               return ret;
+                       sort_again = 1;
+               }
+       }
+       /* we update the max avail free extent of each devices, sort again */
+       if (sort_again)
+               btrfs_descending_sort_devices(devices, nr_device);
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               min_devices = 1;
+       if (!devices[min_devices - 1].max_avail)
+               return -ENOSPC;
+       max_avail = devices[min_devices - 1].max_avail;
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               do_div(max_avail, 2);
+       max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+                                            min_stripes, 1);
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               min_free = max_avail * 2;
+       else
+               min_free = max_avail;
+       if (min_free > devices[min_devices - 1].max_avail)
+               return -ENOSPC;
+       map = __shrink_map_lookup_stripes(map, min_stripes);
+       *stripe_size = max_avail;
+       index = 0;
+       for (i = 0; i < min_stripes; i++) {
+               map->stripes[i].dev = devices[index].dev;
+               map->stripes[i].physical = devices[index].dev_offset;
+               if (type & BTRFS_BLOCK_GROUP_DUP) {
+                       i++;
+                       map->stripes[i].dev = devices[index].dev;
+                       map->stripes[i].physical = devices[index].dev_offset +
+                                                  max_avail;
+               }
+               index++;
+       }
+       *map_lookup = map;
  
-       do_div(calc_size, stripe_len);
-       calc_size *= stripe_len;
+       return 0;
+ }
+ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *extent_root,
+                              struct map_lookup **map_ret,
+                              u64 *num_bytes, u64 *stripe_size,
+                              u64 start, u64 type)
+ {
+       struct btrfs_fs_info *info = extent_root->fs_info;
+       struct btrfs_device *device = NULL;
+       struct btrfs_fs_devices *fs_devices = info->fs_devices;
+       struct list_head *cur;
+       struct map_lookup *map;
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct btrfs_device_info *devices_info;
+       struct list_head private_devs;
+       u64 calc_size = 1024 * 1024 * 1024;
+       u64 min_free;
+       u64 avail;
+       u64 dev_offset;
+       int num_stripes;
+       int min_stripes;
+       int sub_stripes;
+       int min_devices;        /* the min number of devices we need */
+       int i;
+       int ret;
+       int index;
+       if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+           (type & BTRFS_BLOCK_GROUP_DUP)) {
+               WARN_ON(1);
+               type &= ~BTRFS_BLOCK_GROUP_DUP;
+       }
+       if (list_empty(&fs_devices->alloc_list))
+               return -ENOSPC;
+       ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+                                   &min_stripes, &sub_stripes);
+       if (ret)
+               return ret;
+       devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+                              GFP_NOFS);
+       if (!devices_info)
+               return -ENOMEM;
+       map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+       if (!map) {
+               ret = -ENOMEM;
+               goto error;
+       }
+       map->num_stripes = num_stripes;
  
        cur = fs_devices->alloc_list.next;
        index = 0;
+       i = 0;
  
-       if (type & BTRFS_BLOCK_GROUP_DUP)
+       calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+                                            num_stripes, 0);
+       if (type & BTRFS_BLOCK_GROUP_DUP) {
                min_free = calc_size * 2;
-       else
+               min_devices = 1;
+       } else {
                min_free = calc_size;
-       /*
-        * we add 1MB because we never use the first 1MB of the device, unless
-        * we've looped, then we are likely allocating the maximum amount of
-        * space left already
-        */
-       if (!looped)
-               min_free += 1024 * 1024;
+               min_devices = min_stripes;
+       }
  
        INIT_LIST_HEAD(&private_devs);
        while (index < num_stripes) {
                cur = cur->next;
  
                if (device->in_fs_metadata && avail >= min_free) {
-                       ret = find_free_dev_extent(trans, device,
-                                                  min_free, &dev_offset,
-                                                  &max_avail);
+                       ret = find_free_dev_extent(trans, device, min_free,
+                                                  &devices_info[i].dev_offset,
+                                                  &devices_info[i].max_avail);
                        if (ret == 0) {
                                list_move_tail(&device->dev_alloc_list,
                                               &private_devs);
                                map->stripes[index].dev = device;
-                               map->stripes[index].physical = dev_offset;
+                               map->stripes[index].physical =
+                                               devices_info[i].dev_offset;
                                index++;
                                if (type & BTRFS_BLOCK_GROUP_DUP) {
                                        map->stripes[index].dev = device;
                                        map->stripes[index].physical =
-                                               dev_offset + calc_size;
+                                               devices_info[i].dev_offset +
+                                               calc_size;
                                        index++;
                                }
-                       }
-               } else if (device->in_fs_metadata && avail > max_avail)
-                       max_avail = avail;
+                       } else if (ret != -ENOSPC)
+                               goto error;
+                       devices_info[i].dev = device;
+                       i++;
+               } else if (device->in_fs_metadata &&
+                          avail >= BTRFS_STRIPE_LEN) {
+                       devices_info[i].dev = device;
+                       devices_info[i].max_avail = avail;
+                       i++;
+               }
                if (cur == &fs_devices->alloc_list)
                        break;
        }
        list_splice(&private_devs, &fs_devices->alloc_list);
        if (index < num_stripes) {
                if (index >= min_stripes) {
                                num_stripes /= sub_stripes;
                                num_stripes *= sub_stripes;
                        }
-                       looped = 1;
-                       goto again;
-               }
-               if (!looped && max_avail > 0) {
-                       looped = 1;
-                       calc_size = max_avail;
-                       goto again;
+                       map = __shrink_map_lookup_stripes(map, num_stripes);
+               } else if (i >= min_devices) {
+                       ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+                                                      devices_info, i, type,
+                                                      &map, min_stripes,
+                                                      &calc_size);
+                       if (ret)
+                               goto error;
+               } else {
+                       ret = -ENOSPC;
+                       goto error;
                }
-               kfree(map);
-               return -ENOSPC;
        }
        map->sector_size = extent_root->sectorsize;
-       map->stripe_len = stripe_len;
-       map->io_align = stripe_len;
-       map->io_width = stripe_len;
+       map->stripe_len = BTRFS_STRIPE_LEN;
+       map->io_align = BTRFS_STRIPE_LEN;
+       map->io_width = BTRFS_STRIPE_LEN;
        map->type = type;
-       map->num_stripes = num_stripes;
        map->sub_stripes = sub_stripes;
  
        *map_ret = map;
        *stripe_size = calc_size;
        *num_bytes = chunk_bytes_by_type(type, calc_size,
-                                        num_stripes, sub_stripes);
+                                        map->num_stripes, sub_stripes);
  
        em = alloc_extent_map(GFP_NOFS);
        if (!em) {
-               kfree(map);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto error;
        }
        em->bdev = (struct block_device *)map;
        em->start = start;
                index++;
        }
  
+       kfree(devices_info);
        return 0;
+ error:
+       kfree(map);
+       kfree(devices_info);
+       return ret;
  }
  
  static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@@ -3087,6 -3378,7 +3379,6 @@@ static struct btrfs_device *add_missing
                return NULL;
        list_add(&device->dev_list,
                 &fs_devices->devices);
 -      device->barriers = 1;
        device->dev_root = root->fs_info->dev_root;
        device->devid = devid;
        device->work.func = pending_bios_fn;
diff --combined fs/btrfs/volumes.h
  #define __BTRFS_VOLUMES_
  
  #include <linux/bio.h>
+ #include <linux/sort.h>
  #include "async-thread.h"
  
+ #define BTRFS_STRIPE_LEN      (64 * 1024)
  struct buffer_head;
  struct btrfs_pending_bios {
        struct bio *head;
@@@ -42,6 -45,7 +45,6 @@@ struct btrfs_device 
        int running_pending;
        u64 generation;
  
 -      int barriers;
        int writeable;
        int in_fs_metadata;
        int missing;
@@@ -50,7 -54,7 +53,7 @@@
  
        struct block_device *bdev;
  
 -      /* the mode sent to open_bdev_exclusive */
 +      /* the mode sent to blkdev_get */
        fmode_t mode;
  
        char *name;
@@@ -136,6 -140,30 +139,30 @@@ struct btrfs_multi_bio 
        struct btrfs_bio_stripe stripes[];
  };
  
+ struct btrfs_device_info {
+       struct btrfs_device *dev;
+       u64 dev_offset;
+       u64 max_avail;
+ };
+ /* Used to sort the devices by max_avail(descending sort) */
+ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+ /*
+  * sort the devices by max_avail, in which max free extent size of each device
+  * is stored.(Descending Sort)
+  */
+ static inline void btrfs_descending_sort_devices(
+                                       struct btrfs_device_info *devices,
+                                       size_t nr_devices)
+ {
+       sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+            btrfs_cmp_device_free_bytes, NULL);
+ }
+ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+                                  u64 end, u64 *length);
  #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))