Merge branch 'for-linus' into raid56-experimental

author Chris Mason <chris.mason@fusionio.com>

Tue, 5 Feb 2013 15:04:03 +0000 (10:04 -0500)

committer Chris Mason <chris.mason@fusionio.com>

Tue, 5 Feb 2013 15:04:03 +0000 (10:04 -0500)
author Chris Mason <chris.mason@fusionio.com>
Tue, 5 Feb 2013 15:04:03 +0000 (10:04 -0500)
committer Chris Mason <chris.mason@fusionio.com>
Tue, 5 Feb 2013 15:04:03 +0000 (10:04 -0500)
diff --combined fs/btrfs/extent-tree.c

index 87b0e85,85b8454..7e801ad
--- 1/fs/btrfs/extent-tree.c
--- 2/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@@ -31,7 -31,6 +31,7 @@@
   #include "print-tree.h"
   #include "transaction.h"
   #include "volumes.h"
+ +#include "raid56.h"
   #include "locking.h"
   #include "free-space-cache.h"
   #include "math.h"
@@@ -1853,8 -1852,6 +1853,8 @@@ static int btrfs_discard_extent(struct 
                 *actual_bytes = discarded_bytes;
   
   
+ +      if (ret == -EOPNOTSUPP)
+ +              ret = 0;
         return ret;
   }
   
@@@ -2438,16 -2435,6 +2438,16 @@@ int btrfs_delayed_refs_qgroup_accountin
         return ret;
   }
   
+ +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+ +                    int count)
+ +{
+ +      int val = atomic_read(&delayed_refs->ref_seq);
+ +
+ +      if (val < seq || val >= seq + count)
+ +              return 1;
+ +      return 0;
+ +}
+ +
   /*
    * this starts processing the delayed reference count updates and
    * extent insertions we have queued up so far.  count can be
@@@ -2482,44 -2469,6 +2482,44 @@@ int btrfs_run_delayed_refs(struct btrfs
   
         delayed_refs = &trans->transaction->delayed_refs;
         INIT_LIST_HEAD(&cluster);
+ +      if (count == 0) {
+ +              count = delayed_refs->num_entries * 2;
+ +              run_most = 1;
+ +      }
+ +
+ +      if (!run_all && !run_most) {
+ +              int old;
+ +              int seq = atomic_read(&delayed_refs->ref_seq);
+ +
+ +progress:
+ +              old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ +              if (old) {
+ +                      DEFINE_WAIT(__wait);
+ +                      if (delayed_refs->num_entries < 16348)
+ +                              return 0;
+ +
+ +                      prepare_to_wait(&delayed_refs->wait, &__wait,
+ +                                      TASK_UNINTERRUPTIBLE);
+ +
+ +                      old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ +                      if (old) {
+ +                              schedule();
+ +                              finish_wait(&delayed_refs->wait, &__wait);
+ +
+ +                              if (!refs_newer(delayed_refs, seq, 256))
+ +                                      goto progress;
+ +                              else
+ +                                      return 0;
+ +                      } else {
+ +                              finish_wait(&delayed_refs->wait, &__wait);
+ +                              goto again;
+ +                      }
+ +              }
+ +
+ +      } else {
+ +              atomic_inc(&delayed_refs->procs_running_refs);
+ +      }
+ +
   again:
         loops = 0;
         spin_lock(&delayed_refs->lock);
@@@ -2528,6 -2477,10 +2528,6 @@@
         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
   #endif
   
- -      if (count == 0) {
- -              count = delayed_refs->num_entries * 2;
- -              run_most = 1;
- -      }
         while (1) {
                 if (!(run_all || run_most) &&
                     delayed_refs->num_heads_ready < 64)
@@@ -2549,12 -2502,9 +2549,12 @@@
                 if (ret < 0) {
                         spin_unlock(&delayed_refs->lock);
                         btrfs_abort_transaction(trans, root, ret);
+ +                      atomic_dec(&delayed_refs->procs_running_refs);
                         return ret;
                 }
   
+ +              atomic_add(ret, &delayed_refs->ref_seq);
+ +
                 count -= min_t(unsigned long, ret, count);
   
                 if (count == 0)
@@@ -2623,11 -2573,6 +2623,11 @@@
                 goto again;
         }
   out:
+ +      atomic_dec(&delayed_refs->procs_running_refs);
+ +      smp_mb();
+ +      if (waitqueue_active(&delayed_refs->wait))
+ +              wake_up(&delayed_refs->wait);
+ +
         spin_unlock(&delayed_refs->lock);
         assert_qgroups_uptodate(trans);
         return 0;
@@@ -3331,7 -3276,6 +3331,7 @@@ u64 btrfs_reduce_alloc_profile(struct b
         u64 num_devices = root->fs_info->fs_devices->rw_devices +
                 root->fs_info->fs_devices->missing_devices;
         u64 target;
+ +      u64 tmp;
   
         /*
          * see if restripe for this chunk_type is in progress, if so
@@@ -3348,32 -3292,30 +3348,32 @@@
         }
         spin_unlock(&root->fs_info->balance_lock);
   
+ +      /* First, mask out the RAID levels which aren't possible */
         if (num_devices == 1)
- -              flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ +              flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+ +                         BTRFS_BLOCK_GROUP_RAID5);
+ +      if (num_devices < 3)
+ +              flags &= ~BTRFS_BLOCK_GROUP_RAID6;
         if (num_devices < 4)
                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
   
- -      if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
- -          (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- -                    BTRFS_BLOCK_GROUP_RAID10))) {
- -              flags &= ~BTRFS_BLOCK_GROUP_DUP;
- -      }
- -
- -      if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
- -          (flags & BTRFS_BLOCK_GROUP_RAID10)) {
- -              flags &= ~BTRFS_BLOCK_GROUP_RAID1;
- -      }
+ +      tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ +                     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+ +                     BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+ +      flags &= ~tmp;
   
- -      if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
- -          ((flags & BTRFS_BLOCK_GROUP_RAID1) |
- -           (flags & BTRFS_BLOCK_GROUP_RAID10) |
- -           (flags & BTRFS_BLOCK_GROUP_DUP))) {
- -              flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- -      }
+ +      if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+ +              tmp = BTRFS_BLOCK_GROUP_RAID6;
+ +      else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+ +              tmp = BTRFS_BLOCK_GROUP_RAID5;
+ +      else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+ +              tmp = BTRFS_BLOCK_GROUP_RAID10;
+ +      else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+ +              tmp = BTRFS_BLOCK_GROUP_RAID1;
+ +      else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+ +              tmp = BTRFS_BLOCK_GROUP_RAID0;
   
- -      return extended_to_chunk(flags);
+ +      return extended_to_chunk(flags | tmp);
   }
   
   static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@@ -3391,7 -3333,6 +3391,7 @@@
   u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
   {
         u64 flags;
+ +      u64 ret;
   
         if (data)
                 flags = BTRFS_BLOCK_GROUP_DATA;
@@@ -3400,8 -3341,7 +3400,8 @@@
         else
                 flags = BTRFS_BLOCK_GROUP_METADATA;
   
- -      return get_alloc_profile(root, flags);
+ +      ret = get_alloc_profile(root, flags);
+ +      return ret;
   }
   
   /*
@@@ -3576,10 -3516,8 +3576,10 @@@ static u64 get_system_chunk_thresh(stru
   {
         u64 num_dev;
   
- -      if (type & BTRFS_BLOCK_GROUP_RAID10 ||
- -          type & BTRFS_BLOCK_GROUP_RAID0)
+ +      if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+ +                  BTRFS_BLOCK_GROUP_RAID0 |
+ +                  BTRFS_BLOCK_GROUP_RAID5 |
+ +                  BTRFS_BLOCK_GROUP_RAID6))
                 num_dev = root->fs_info->fs_devices->rw_devices;
         else if (type & BTRFS_BLOCK_GROUP_RAID1)
                 num_dev = 2;
@@@ -3729,9 -3667,7 +3729,9 @@@ static int can_overcommit(struct btrfs_
   
         /*
          * If we have dup, raid1 or raid10 then only half of the free
- -       * space is actually useable.
+ +       * space is actually useable.  For raid56, the space info used
+ +       * doesn't include the parity drive, so we don't have to
+ +       * change the math
          */
         if (profile & (BTRFS_BLOCK_GROUP_DUP |
                        BTRFS_BLOCK_GROUP_RAID1 |
@@@ -4061,7 -3997,7 +4061,7 @@@ again
          * We make the other tasks wait for the flush only when we can flush
          * all things.
          */
-       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
+       if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                 flushing = true;
                 space_info->flush = 1;
         }
@@@ -5519,14 -5455,10 +5519,14 @@@ int btrfs_free_extent(struct btrfs_tran
         return ret;
   }
   
- -static u64 stripe_align(struct btrfs_root *root, u64 val)
+ +static u64 stripe_align(struct btrfs_root *root,
+ +                      struct btrfs_block_group_cache *cache,
+ +                      u64 val, u64 num_bytes)
   {
- -      u64 mask = ((u64)root->stripesize - 1);
- -      u64 ret = (val + mask) & ~mask;
+ +      u64 mask;
+ +      u64 ret;
+ +      mask = ((u64)root->stripesize - 1);
+ +      ret = (val + mask) & ~mask;
         return ret;
   }
   
@@@ -5587,12 -5519,9 +5587,12 @@@ int __get_raid_index(u64 flags
                 index = 2;
         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
                 index = 3;
+ +      else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ +              index = 5;
+ +      else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ +              index = 6;
         else
- -              index = 4;
- -
+ +              index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */
         return index;
   }
   
@@@ -5631,7 -5560,7 +5631,7 @@@ static noinline int find_free_extent(st
         int empty_cluster = 2 * 1024 * 1024;
         struct btrfs_space_info *space_info;
         int loop = 0;
-       int index = 0;
+       int index = __get_raid_index(data);
         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
         bool found_uncached_bg = false;
@@@ -5736,8 -5665,6 +5736,8 @@@ search
                 if (!block_group_bits(block_group, data)) {
                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
                                 BTRFS_BLOCK_GROUP_RAID1 |
+ +                              BTRFS_BLOCK_GROUP_RAID5 |
+ +                              BTRFS_BLOCK_GROUP_RAID6 |
                                 BTRFS_BLOCK_GROUP_RAID10;
   
                         /*
@@@ -5767,7 -5694,6 +5767,7 @@@ have_block_group
                  * lets look there
                  */
                 if (last_ptr) {
+ +                      unsigned long aligned_cluster;
                         /*
                          * the refill lock keeps out other
                          * people trying to start a new cluster
@@@ -5834,15 -5760,11 +5834,15 @@@ refill_cluster
                                 goto unclustered_alloc;
                         }
   
+ +                      aligned_cluster = max_t(unsigned long,
+ +                                              empty_cluster + empty_size,
+ +                                            block_group->full_stripe_len);
+ +
                         /* allocate a cluster in this block group */
                         ret = btrfs_find_space_cluster(trans, root,
                                                block_group, last_ptr,
                                                search_start, num_bytes,
- -                                             empty_cluster + empty_size);
+ +                                             aligned_cluster);
                         if (ret == 0) {
                                 /*
                                  * now pull our allocation out of this
@@@ -5913,8 -5835,7 +5913,8 @@@ unclustered_alloc
                         goto loop;
                 }
   checks:
- -              search_start = stripe_align(root, offset);
+ +              search_start = stripe_align(root, used_block_group,
+ +                                          offset, num_bytes);
   
                 /* move on to the next group */
                 if (search_start + num_bytes >
@@@ -6867,11 -6788,13 +6867,13 @@@ static noinline int walk_up_proc(struc
                                                        &wc->flags[level]);
                         if (ret < 0) {
                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
+                               path->locks[level] = 0;
                                 return ret;
                         }
                         BUG_ON(wc->refs[level] == 0);
                         if (wc->refs[level] == 1) {
                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
+                               path->locks[level] = 0;
                                 return 1;
                         }
                 }
@@@ -7282,7 -7205,6 +7284,7 @@@ static u64 update_block_group_flags(str
                 root->fs_info->fs_devices->missing_devices;
   
         stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ +              BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
   
         if (num_devices == 1) {
@@@ -7834,9 -7756,7 +7836,9 @@@ int btrfs_read_block_groups(struct btrf
                 btrfs_release_path(path);
                 cache->flags = btrfs_block_group_flags(&cache->item);
                 cache->sectorsize = root->sectorsize;
- -
+ +              cache->full_stripe_len = btrfs_full_stripe_len(root,
+ +                                             &root->fs_info->mapping_tree,
+ +                                             found_key.objectid);
                 btrfs_init_free_space_ctl(cache);
   
                 /*
@@@ -7890,8 -7810,6 +7892,8 @@@
                 if (!(get_alloc_profile(root, space_info->flags) &
                       (BTRFS_BLOCK_GROUP_RAID10 |
                        BTRFS_BLOCK_GROUP_RAID1 |
+ +                     BTRFS_BLOCK_GROUP_RAID5 |
+ +                     BTRFS_BLOCK_GROUP_RAID6 |
                        BTRFS_BLOCK_GROUP_DUP)))
                         continue;
                 /*
@@@ -7967,9 -7885,6 +7969,9 @@@ int btrfs_make_block_group(struct btrfs
         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
         cache->sectorsize = root->sectorsize;
         cache->fs_info = root->fs_info;
+ +      cache->full_stripe_len = btrfs_full_stripe_len(root,
+ +                                             &root->fs_info->mapping_tree,
+ +                                             chunk_offset);
   
         atomic_set(&cache->count, 1);
         spin_lock_init(&cache->lock);
diff --combined fs/btrfs/free-space-cache.c

index 62020b7,0be7a87..e067cae
--- 1/fs/btrfs/free-space-cache.c
--- 2/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@@ -1463,14 -1463,10 +1463,14 @@@ static int search_bitmap(struct btrfs_f
   }
   
   static struct btrfs_free_space *
- -find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+ +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ +              unsigned long align)
   {
         struct btrfs_free_space *entry;
         struct rb_node *node;
+ +      u64 ctl_off;
+ +      u64 tmp;
+ +      u64 align_off;
         int ret;
   
         if (!ctl->free_space_offset.rb_node)
@@@ -1485,34 -1481,15 +1485,34 @@@
                 if (entry->bytes < *bytes)
                         continue;
   
+ +              /* make sure the space returned is big enough
+ +               * to match our requested alignment
+ +               */
+ +              if (*bytes >= align) {
+ +                      ctl_off = entry->offset - ctl->start;
+ +                      tmp = ctl_off + align - 1;;
+ +                      do_div(tmp, align);
+ +                      tmp = tmp * align + ctl->start;
+ +                      align_off = tmp - entry->offset;
+ +              } else {
+ +                      align_off = 0;
+ +                      tmp = entry->offset;
+ +              }
+ +
+ +              if (entry->bytes < *bytes + align_off)
+ +                      continue;
+ +
                 if (entry->bitmap) {
- -                      ret = search_bitmap(ctl, entry, offset, bytes);
- -                      if (!ret)
+ +                      ret = search_bitmap(ctl, entry, &tmp, bytes);
+ +                      if (!ret) {
+ +                              *offset = tmp;
                                 return entry;
+ +                      }
                         continue;
                 }
   
- -              *offset = entry->offset;
- -              *bytes = entry->bytes;
+ +              *offset = tmp;
+ +              *bytes = entry->bytes - align_off;
                 return entry;
         }
   
@@@ -1885,11 -1862,13 +1885,13 @@@ int btrfs_remove_free_space(struct btrf
   {
         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
         struct btrfs_free_space *info;
-       int ret = 0;
+       int ret;
+       bool re_search = false;
   
         spin_lock(&ctl->tree_lock);
   
   again:
+       ret = 0;
         if (!bytes)
                 goto out_lock;
   
@@@ -1902,17 -1881,17 +1904,17 @@@
                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                           1, 0);
                 if (!info) {
-                       /* the tree logging code might be calling us before we
-                        * have fully loaded the free space rbtree for this
-                        * block group.  So it is possible the entry won't
-                        * be in the rbtree yet at all.  The caching code
-                        * will make sure not to put it in the rbtree if
-                        * the logging code has pinned it.
+                       /*
+                        * If we found a partial bit of our free space in a
+                        * bitmap but then couldn't find the other part this may
+                        * be a problem, so WARN about it.
                          */
+                       WARN_ON(re_search);
                         goto out_lock;
                 }
         }
   
+       re_search = false;
         if (!info->bitmap) {
                 unlink_free_space(ctl, info);
                 if (offset == info->offset) {
@@@ -1958,8 -1937,10 +1960,10 @@@
         }
   
         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
-       if (ret == -EAGAIN)
+       if (ret == -EAGAIN) {
+               re_search = true;
                 goto again;
+       }
         BUG_ON(ret); /* logic error */
   out_lock:
         spin_unlock(&ctl->tree_lock);
@@@ -2114,12 -2095,9 +2118,12 @@@ u64 btrfs_find_space_for_alloc(struct b
         struct btrfs_free_space *entry = NULL;
         u64 bytes_search = bytes + empty_size;
         u64 ret = 0;
+ +      u64 align_gap = 0;
+ +      u64 align_gap_len = 0;
   
         spin_lock(&ctl->tree_lock);
- -      entry = find_free_space(ctl, &offset, &bytes_search);
+ +      entry = find_free_space(ctl, &offset, &bytes_search,
+ +                              block_group->full_stripe_len);
         if (!entry)
                 goto out;
   
@@@ -2129,15 -2107,9 +2133,15 @@@
                 if (!entry->bytes)
                         free_bitmap(ctl, entry);
         } else {
+ +
                 unlink_free_space(ctl, entry);
- -              entry->offset += bytes;
- -              entry->bytes -= bytes;
+ +              align_gap_len = offset - entry->offset;
+ +              align_gap = entry->offset;
+ +
+ +              entry->offset = offset + bytes;
+ +              WARN_ON(entry->bytes < bytes + align_gap_len);
+ +
+ +              entry->bytes -= bytes + align_gap_len;
                 if (!entry->bytes)
                         kmem_cache_free(btrfs_free_space_cachep, entry);
                 else
@@@ -2147,8 -2119,6 +2151,8 @@@
   out:
         spin_unlock(&ctl->tree_lock);
   
+ +      if (align_gap_len)
+ +              __btrfs_add_free_space(ctl, align_gap, align_gap_len);
         return ret;
   }
   
diff --combined fs/btrfs/inode.c

index 6f4e41d,ca7ace7..492ee0e
--- 1/fs/btrfs/inode.c
--- 2/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -39,7 -39,6 +39,7 @@@
   #include <linux/slab.h>
   #include <linux/ratelimit.h>
   #include <linux/mount.h>
+ +#include <linux/blkdev.h>
   #include "compat.h"
   #include "ctree.h"
   #include "disk-io.h"
@@@ -89,7 -88,7 +89,7 @@@ static unsigned char btrfs_type_by_mode
         [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
   };
   
- static int btrfs_setsize(struct inode *inode, loff_t newsize);
+ static int btrfs_setsize(struct inode *inode, struct iattr *attr);
   static int btrfs_truncate(struct inode *inode);
   static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
   static noinline int cow_file_range(struct inode *inode,
@@@ -1567,7 -1566,7 +1567,7 @@@ static void btrfs_clear_bit_hook(struc
    * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
    * we don't create bios that span stripes or chunks
    */
- -int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+ +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
                          size_t size, struct bio *bio,
                          unsigned long bio_flags)
   {
@@@ -1582,7 -1581,7 +1582,7 @@@
   
         length = bio->bi_size;
         map_length = length;
- -      ret = btrfs_map_block(root->fs_info, READ, logical,
+ +      ret = btrfs_map_block(root->fs_info, rw, logical,
                               &map_length, NULL, 0);
         /* Will always return 0 with map_multi == NULL */
         BUG_ON(ret < 0);
@@@ -2479,6 -2478,18 +2479,18 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                                 continue;
                         }
                         nr_truncate++;
+ 
+                       /* 1 for the orphan item deletion. */
+                       trans = btrfs_start_transaction(root, 1);
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               goto out;
+                       }
+                       ret = btrfs_orphan_add(trans, inode);
+                       btrfs_end_transaction(trans, root);
+                       if (ret)
+                               goto out;
+ 
                         ret = btrfs_truncate(inode);
                 } else {
                         nr_unlink++;
@@@ -3666,6 -3677,7 +3678,7 @@@ int btrfs_cont_expand(struct inode *ino
                                 block_end - cur_offset, 0);
                 if (IS_ERR(em)) {
                         err = PTR_ERR(em);
+                       em = NULL;
                         break;
                 }
                 last_byte = min(extent_map_end(em), block_end);
@@@ -3749,16 -3761,27 +3762,27 @@@ next
         return err;
   }
   
- static int btrfs_setsize(struct inode *inode, loff_t newsize)
+ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
   {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_trans_handle *trans;
         loff_t oldsize = i_size_read(inode);
+       loff_t newsize = attr->ia_size;
+       int mask = attr->ia_valid;
         int ret;
   
         if (newsize == oldsize)
                 return 0;
   
+       /*
+        * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+        * special case where we need to update the times despite not having
+        * these flags set.  For all other operations the VFS sets these flags
+        * explicitly if it wants a timestamp update.
+        */
+       if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+               inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+ 
         if (newsize > oldsize) {
                 truncate_pagecache(inode, oldsize, newsize);
                 ret = btrfs_cont_expand(inode, oldsize, newsize);
@@@ -3784,9 -3807,34 +3808,34 @@@
                         set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
                                 &BTRFS_I(inode)->runtime_flags);
   
+               /*
+                * 1 for the orphan item we're going to add
+                * 1 for the orphan item deletion.
+                */
+               trans = btrfs_start_transaction(root, 2);
+               if (IS_ERR(trans))
+                       return PTR_ERR(trans);
+ 
+               /*
+                * We need to do this in case we fail at _any_ point during the
+                * actual truncate.  Once we do the truncate_setsize we could
+                * invalidate pages which forces any outstanding ordered io to
+                * be instantly completed which will give us extents that need
+                * to be truncated.  If we fail to get an orphan inode down we
+                * could have left over extents that were never meant to live,
+                * so we need to guarantee from this point on that everything
+                * will be consistent.
+                */
+               ret = btrfs_orphan_add(trans, inode);
+               btrfs_end_transaction(trans, root);
+               if (ret)
+                       return ret;
+ 
                 /* we don't support swapfiles, so vmtruncate shouldn't fail */
                 truncate_setsize(inode, newsize);
                 ret = btrfs_truncate(inode);
+               if (ret && inode->i_nlink)
+                       btrfs_orphan_del(NULL, inode);
         }
   
         return ret;
@@@ -3806,7 -3854,7 +3855,7 @@@ static int btrfs_setattr(struct dentry 
                 return err;
   
         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-               err = btrfs_setsize(inode, attr->ia_size);
+               err = btrfs_setsize(inode, attr);
                 if (err)
                         return err;
         }
@@@ -5587,10 -5635,13 +5636,13 @@@ struct extent_map *btrfs_get_extent_fie
                 return em;
         if (em) {
                 /*
-                * if our em maps to a hole, there might
-                * actually be delalloc bytes behind it
+                * if our em maps to
+                * -  a hole or
+                * -  a pre-alloc extent,
+                * there might actually be delalloc bytes behind it.
                  */
-               if (em->block_start != EXTENT_MAP_HOLE)
+               if (em->block_start != EXTENT_MAP_HOLE &&
+                   !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                         return em;
                 else
                         hole_em = em;
@@@ -5672,6 -5723,8 +5724,8 @@@
                          */
                         em->block_start = hole_em->block_start;
                         em->block_len = hole_len;
+                       if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+                               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                 } else {
                         em->start = range_start;
                         em->len = found;
@@@ -6387,24 -6440,19 +6441,24 @@@ static int btrfs_submit_direct_hook(in
         int async_submit = 0;
   
         map_length = orig_bio->bi_size;
- -      ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+ +      ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                               &map_length, NULL, 0);
         if (ret) {
                 bio_put(orig_bio);
                 return -EIO;
         }
- -
         if (map_length >= orig_bio->bi_size) {
                 bio = orig_bio;
                 goto submit;
         }
   
- -      async_submit = 1;
+ +      /* async crcs make it difficult to collect full stripe writes. */
+ +      if (btrfs_get_alloc_profile(root, 1) &
+ +          (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ +              async_submit = 0;
+ +      else
+ +              async_submit = 1;
+ +
         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
         if (!bio)
                 return -ENOMEM;
@@@ -6446,7 -6494,7 +6500,7 @@@
                         bio->bi_end_io = btrfs_end_dio_bio;
   
                         map_length = orig_bio->bi_size;
- -                      ret = btrfs_map_block(root->fs_info, READ,
+ +                      ret = btrfs_map_block(root->fs_info, rw,
                                               start_sector << 9,
                                               &map_length, NULL, 0);
                         if (ret) {
@@@ -6589,17 -6637,15 +6643,17 @@@ static ssize_t btrfs_direct_IO(int rw, 
   {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
+ +      ssize_t ret;
   
         if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
                             offset, nr_segs))
                 return 0;
   
- -      return __blockdev_direct_IO(rw, iocb, inode,
+ +      ret = __blockdev_direct_IO(rw, iocb, inode,
                    BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
                    iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
                    btrfs_submit_direct, 0);
+ +      return ret;
   }
   
   #define BTRFS_FIEMAP_FLAGS    (FIEMAP_FLAG_SYNC)
@@@ -6937,11 -6983,9 +6991,9 @@@ static int btrfs_truncate(struct inode 
   
         /*
          * 1 for the truncate slack space
-        * 1 for the orphan item we're going to add
-        * 1 for the orphan item deletion
          * 1 for updating the inode.
          */
-       trans = btrfs_start_transaction(root, 4);
+       trans = btrfs_start_transaction(root, 2);
         if (IS_ERR(trans)) {
                 err = PTR_ERR(trans);
                 goto out;
@@@ -6952,12 -6996,6 +7004,6 @@@
                                       min_size);
         BUG_ON(ret);
   
-       ret = btrfs_orphan_add(trans, inode);
-       if (ret) {
-               btrfs_end_transaction(trans, root);
-               goto out;
-       }
- 
         /*
          * setattr is responsible for setting the ordered_data_close flag,
          * but that is only tested during the last file release.  That
@@@ -7026,12 -7064,6 +7072,6 @@@
                 ret = btrfs_orphan_del(trans, inode);
                 if (ret)
                         err = ret;
-       } else if (ret && inode->i_nlink > 0) {
-               /*
-                * Failed to do the truncate, remove us from the in memory
-                * orphan list.
-                */
-               ret = btrfs_orphan_del(NULL, inode);
         }
   
         if (trans) {
@@@ -7553,41 -7585,61 +7593,61 @@@ void btrfs_wait_and_free_delalloc_work(
    */
   int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
   {
-       struct list_head *head = &root->fs_info->delalloc_inodes;
         struct btrfs_inode *binode;
         struct inode *inode;
         struct btrfs_delalloc_work *work, *next;
         struct list_head works;
+       struct list_head splice;
         int ret = 0;
   
         if (root->fs_info->sb->s_flags & MS_RDONLY)
                 return -EROFS;
   
         INIT_LIST_HEAD(&works);
- 
+       INIT_LIST_HEAD(&splice);
+ again:
         spin_lock(&root->fs_info->delalloc_lock);
-       while (!list_empty(head)) {
-               binode = list_entry(head->next, struct btrfs_inode,
+       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       while (!list_empty(&splice)) {
+               binode = list_entry(splice.next, struct btrfs_inode,
                                     delalloc_inodes);
+ 
+               list_del_init(&binode->delalloc_inodes);
+ 
                 inode = igrab(&binode->vfs_inode);
                 if (!inode)
-                       list_del_init(&binode->delalloc_inodes);
+                       continue;
+ 
+               list_add_tail(&binode->delalloc_inodes,
+                             &root->fs_info->delalloc_inodes);
                 spin_unlock(&root->fs_info->delalloc_lock);
-               if (inode) {
-                       work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-                       if (!work) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       list_add_tail(&work->list, &works);
-                       btrfs_queue_worker(&root->fs_info->flush_workers,
-                                          &work->work);
+ 
+               work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+               if (unlikely(!work)) {
+                       ret = -ENOMEM;
+                       goto out;
                 }
+               list_add_tail(&work->list, &works);
+               btrfs_queue_worker(&root->fs_info->flush_workers,
+                                  &work->work);
+ 
                 cond_resched();
                 spin_lock(&root->fs_info->delalloc_lock);
         }
         spin_unlock(&root->fs_info->delalloc_lock);
   
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+ 
+       spin_lock(&root->fs_info->delalloc_lock);
+       if (!list_empty(&root->fs_info->delalloc_inodes)) {
+               spin_unlock(&root->fs_info->delalloc_lock);
+               goto again;
+       }
+       spin_unlock(&root->fs_info->delalloc_lock);
+ 
         /* the filemap_flush will queue IO into the worker threads, but
          * we have to make sure the IO is actually started and that
          * ordered extents get created before we return
@@@ -7600,11 -7652,18 +7660,18 @@@
                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
         }
         atomic_dec(&root->fs_info->async_submit_draining);
+       return 0;
   out:
         list_for_each_entry_safe(work, next, &works, list) {
                 list_del_init(&work->list);
                 btrfs_wait_and_free_delalloc_work(work);
         }
+ 
+       if (!list_empty_careful(&splice)) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+               spin_unlock(&root->fs_info->delalloc_lock);
+       }
         return ret;
   }
   
diff --combined fs/btrfs/transaction.c

index 1e7f176,f154946..c56b9d4
--- 1/fs/btrfs/transaction.c
--- 2/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@@ -156,9 -156,6 +156,9 @@@ loop
   
         spin_lock_init(&cur_trans->commit_lock);
         spin_lock_init(&cur_trans->delayed_refs.lock);
+ +      atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
+ +      atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
+ +      init_waitqueue_head(&cur_trans->delayed_refs.wait);
   
         INIT_LIST_HEAD(&cur_trans->pending_snapshots);
         list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@@ -580,7 -577,7 +580,7 @@@ static int __btrfs_end_transaction(stru
         if (!list_empty(&trans->new_bgs))
                 btrfs_create_pending_block_groups(trans, root);
   
- -      while (count < 2) {
+ +      while (count < 1) {
                 unsigned long cur = trans->delayed_ref_updates;
                 trans->delayed_ref_updates = 0;
                 if (cur &&
@@@ -592,7 -589,6 +592,7 @@@
                 }
                 count++;
         }
+ +
         btrfs_trans_release_metadata(trans, root);
         trans->block_rsv = NULL;
   
@@@ -690,9 -686,7 +690,9 @@@ int btrfs_write_marked_extents(struct b
         struct extent_state *cached_state = NULL;
         u64 start = 0;
         u64 end;
+ +      struct blk_plug plug;
   
+ +      blk_start_plug(&plug);
         while (!find_first_extent_bit(dirty_pages, start, &start, &end,
                                       mark, &cached_state)) {
                 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@@ -706,7 -700,6 +706,7 @@@
         }
         if (err)
                 werr = err;
+ +      blk_finish_plug(&plug);
         return werr;
   }
   
@@@ -1475,7 -1468,8 +1475,8 @@@ int btrfs_commit_transaction(struct btr
                 goto cleanup_transaction;
         }
   
-       if (cur_trans->aborted) {
+       /* Stop the commit early if ->aborted is set */
+       if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
                 ret = cur_trans->aborted;
                 goto cleanup_transaction;
         }
@@@ -1581,6 -1575,11 +1582,11 @@@
         wait_event(cur_trans->writer_wait,
                    atomic_read(&cur_trans->num_writers) == 1);
   
+       /* ->aborted might be set after the previous check, so check it */
+       if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+               ret = cur_trans->aborted;
+               goto cleanup_transaction;
+       }
         /*
          * the reloc mutex makes sure that we stop
          * the balancing code from coming in and moving
@@@ -1664,6 -1663,17 +1670,17 @@@
                 goto cleanup_transaction;
         }
   
+       /*
+        * The tasks which save the space cache and inode cache may also
+        * update ->aborted, check it.
+        */
+       if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+               ret = cur_trans->aborted;
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               mutex_unlock(&root->fs_info->reloc_mutex);
+               goto cleanup_transaction;
+       }
+ 
         btrfs_prepare_extent_commit(trans, root);
   
         cur_trans = root->fs_info->running_transaction;
diff --combined fs/btrfs/volumes.c

index 77620f2,15f6efd..8818dc3
--- 1/fs/btrfs/volumes.c
--- 2/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -25,8 -25,6 +25,8 @@@
   #include <linux/capability.h>
   #include <linux/ratelimit.h>
   #include <linux/kthread.h>
+ +#include <linux/raid/pq.h>
+ +#include <asm/div64.h>
   #include "compat.h"
   #include "ctree.h"
   #include "extent_map.h"
@@@ -34,7 -32,6 +34,7 @@@
   #include "transaction.h"
   #include "print-tree.h"
   #include "volumes.h"
+ +#include "raid56.h"
   #include "async-thread.h"
   #include "check-integrity.h"
   #include "rcu-string.h"
@@@ -1406,21 -1403,6 +1406,21 @@@ int btrfs_rm_device(struct btrfs_root *
                 goto out;
         }
   
+ +      if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+ +          root->fs_info->fs_devices->rw_devices <= 2) {
+ +              printk(KERN_ERR "btrfs: unable to go below two "
+ +                     "devices on raid5\n");
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +      if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+ +          root->fs_info->fs_devices->rw_devices <= 3) {
+ +              printk(KERN_ERR "btrfs: unable to go below three "
+ +                     "devices on raid6\n");
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
         if (strcmp(device_path, "missing") == 0) {
                 struct list_head *devices;
                 struct btrfs_device *tmp;
@@@ -1449,7 -1431,7 +1449,7 @@@
                 }
         } else {
                 ret = btrfs_get_bdev_and_sb(device_path,
-                                           FMODE_READ | FMODE_EXCL,
+                                           FMODE_WRITE | FMODE_EXCL,
                                             root->fs_info->bdev_holder, 0,
                                             &bdev, &bh);
                 if (ret)
@@@ -1574,8 -1556,7 +1574,8 @@@
         ret = 0;
   
         /* Notify udev that device has changed */
- -      btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+ +      if (bdev)
+ +              btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
   
   error_brelse:
         brelse(bh);
@@@ -2633,7 -2614,14 +2633,14 @@@ static int chunk_usage_filter(struct bt
         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
         chunk_used = btrfs_block_group_used(&cache->item);
   
-       user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+       if (bargs->usage == 0)
+               user_thresh = 0;
+       else if (bargs->usage > 100)
+               user_thresh = cache->key.offset;
+       else
+               user_thresh = div_factor_fine(cache->key.offset,
+                                             bargs->usage);
+ 
         if (chunk_used < user_thresh)
                 ret = 0;
   
@@@ -2675,15 -2663,11 +2682,15 @@@ static int chunk_drange_filter(struct e
                 return 0;
   
         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
- -           BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
- -              factor = 2;
- -      else
- -              factor = 1;
- -      factor = num_stripes / factor;
+ +           BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+ +              factor = num_stripes / 2;
+ +      } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+ +              factor = num_stripes - 1;
+ +      } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+ +              factor = num_stripes - 2;
+ +      } else {
+ +              factor = num_stripes;
+ +      }
   
         for (i = 0; i < num_stripes; i++) {
                 stripe = btrfs_stripe_nr(chunk, i);
@@@ -2982,6 -2966,8 +2989,8 @@@ static void __cancel_balance(struct btr
         unset_balance_control(fs_info);
         ret = del_balance_item(fs_info->tree_root);
         BUG_ON(ret);
+ 
+       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
   }
   
   void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@@ -2998,7 -2984,6 +3007,7 @@@ int btrfs_balance(struct btrfs_balance_
         int mixed = 0;
         int ret;
         u64 num_devices;
+ +      int cancel = 0;
   
         if (btrfs_fs_closing(fs_info) ||
             atomic_read(&fs_info->balance_pause_req) ||
@@@ -3041,9 -3026,7 +3050,9 @@@
                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
         else
                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- -                              BTRFS_BLOCK_GROUP_RAID10);
+ +                              BTRFS_BLOCK_GROUP_RAID10 |
+ +                              BTRFS_BLOCK_GROUP_RAID5 |
+ +                              BTRFS_BLOCK_GROUP_RAID6);
   
         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
             (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@@ -3083,10 -3066,7 +3092,10 @@@
   
         /* allow to reduce meta or sys integrity only if force set */
         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
- -                      BTRFS_BLOCK_GROUP_RAID10;
+ +                      BTRFS_BLOCK_GROUP_RAID10 |
+ +                      BTRFS_BLOCK_GROUP_RAID5 |
+ +                      BTRFS_BLOCK_GROUP_RAID6;
+ +
         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
              (fs_info->avail_system_alloc_bits & allowed) &&
              !(bctl->sys.target & allowed)) ||
@@@ -3152,25 -3132,25 +3161,27 @@@
         }
   
         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
- -          balance_need_close(fs_info)) {
- -              __cancel_balance(fs_info);
- -      }
+ +          balance_need_close(fs_info))
+ +              cancel = 1;
   
         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
                 fs_info->num_tolerated_disk_barrier_failures =
                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
         }
   
+ +      if (cancel)
+ +              __cancel_balance(fs_info);
+ +
         wake_up(&fs_info->balance_wait_q);
   
         return ret;
   out:
         if (bctl->flags & BTRFS_BALANCE_RESUME)
                 __cancel_balance(fs_info);
-       else
+       else {
                 kfree(bctl);
+               atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       }
         return ret;
   }
   
@@@ -3187,7 -3167,6 +3198,6 @@@ static int balance_kthread(void *data
                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
         }
   
-       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
         mutex_unlock(&fs_info->balance_mutex);
         mutex_unlock(&fs_info->volume_mutex);
   
@@@ -3210,7 -3189,6 +3220,6 @@@ int btrfs_resume_balance_async(struct b
                 return 0;
         }
   
-       WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
         if (IS_ERR(tsk))
                 return PTR_ERR(tsk);
@@@ -3264,6 -3242,8 +3273,8 @@@ int btrfs_recover_balance(struct btrfs_
         btrfs_balance_sys(leaf, item, &disk_bargs);
         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
   
+       WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+ 
         mutex_lock(&fs_info->volume_mutex);
         mutex_lock(&fs_info->balance_mutex);
   
@@@ -3523,45 -3503,13 +3534,45 @@@ static int btrfs_cmp_device_info(const 
   }
   
   struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ +      /*
+ +       * sub_stripes -- sub_stripes info for map
+ +       * dev_stripes -- stripes per dev, 2 for DUP, 1 otherwise
+ +       * devs_max -- max devices per stripe, 0 for unlimited
+ +       * devs_min -- min devices per stripe
+ +       * devs_increment -- ndevs must be a multiple of this
+ +       * ncopies -- how many copies of the data we have
+ +       */
         { 2, 1, 0, 4, 2, 2 /* raid10 */ },
         { 1, 1, 2, 2, 2, 2 /* raid1 */ },
         { 1, 2, 1, 1, 1, 2 /* dup */ },
         { 1, 1, 0, 2, 1, 1 /* raid0 */ },
-       { 1, 1, 0, 1, 1, 1 /* single */ },
+       { 1, 1, 1, 1, 1, 1 /* single */ },
+ +      { 1, 1, 0, 2, 1, 2 /* raid5 */ },
+ +      { 1, 1, 0, 3, 1, 3 /* raid6 */ },
   };
   
+ +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+ +{
+ +      /* TODO allow them to set a preferred stripe size */
+ +      return 64 * 1024; /* fixed 64KiB; both arguments are ignored for now */
+ +}
+ +
+ +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+ +{
+ +      u64 features; /* incompat flag word read from info->super_copy */
+ +
+ +      if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ +              return; /* type has neither RAID5 nor RAID6 bit: nothing to do */
+ +
+ +      features = btrfs_super_incompat_flags(info->super_copy);
+ +      if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+ +              return; /* flag already present, skip the update and the log line */
+ +
+ +      features |= BTRFS_FEATURE_INCOMPAT_RAID56; /* first RAID5/6 type seen */
+ +      btrfs_set_super_incompat_flags(info->super_copy, features);
+ +      printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+ +}
+ +
   static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                                struct btrfs_root *extent_root,
                                struct map_lookup **map_ret,
@@@ -3577,8 -3525,6 +3588,8 @@@
         struct btrfs_device_info *devices_info = NULL;
         u64 total_avail;
         int num_stripes;        /* total number of stripes to allocate */
+ +      int data_stripes;       /* number of stripes that count for
+ +                                 block group size */
         int sub_stripes;        /* sub_stripes info for map */
         int dev_stripes;        /* stripes per dev */
         int devs_max;           /* max devs to use */
@@@ -3590,7 -3536,6 +3601,7 @@@
         u64 max_chunk_size;
         u64 stripe_size;
         u64 num_bytes;
+ +      u64 raid_stripe_len = BTRFS_STRIPE_LEN;
         int ndevs;
         int i;
         int j;
@@@ -3716,31 -3661,16 +3727,31 @@@
         stripe_size = devices_info[ndevs-1].max_avail;
         num_stripes = ndevs * dev_stripes;
   
+ +      /*
+ +       * this will have to be fixed for RAID1 and RAID10 over
+ +       * more drives
+ +       */
+ +      data_stripes = num_stripes / ncopies;
+ +
         if (stripe_size * ndevs > max_chunk_size * ncopies) {
                 stripe_size = max_chunk_size * ncopies;
                 do_div(stripe_size, ndevs);
         }
- -
+ +      if (type & BTRFS_BLOCK_GROUP_RAID5) {
+ +              raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+ +                               btrfs_super_stripesize(info->super_copy));
+ +              data_stripes = num_stripes - 1;
+ +      }
+ +      if (type & BTRFS_BLOCK_GROUP_RAID6) {
+ +              raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+ +                               btrfs_super_stripesize(info->super_copy));
+ +              data_stripes = num_stripes - 2;
+ +      }
         do_div(stripe_size, dev_stripes);
   
         /* align to BTRFS_STRIPE_LEN */
- -      do_div(stripe_size, BTRFS_STRIPE_LEN);
- -      stripe_size *= BTRFS_STRIPE_LEN;
+ +      do_div(stripe_size, raid_stripe_len);
+ +      stripe_size *= raid_stripe_len;
   
         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
         if (!map) {
@@@ -3758,14 -3688,14 +3769,14 @@@
                 }
         }
         map->sector_size = extent_root->sectorsize;
- -      map->stripe_len = BTRFS_STRIPE_LEN;
- -      map->io_align = BTRFS_STRIPE_LEN;
- -      map->io_width = BTRFS_STRIPE_LEN;
+ +      map->stripe_len = raid_stripe_len;
+ +      map->io_align = raid_stripe_len;
+ +      map->io_width = raid_stripe_len;
         map->type = type;
         map->sub_stripes = sub_stripes;
   
         *map_ret = map;
- -      num_bytes = stripe_size * (num_stripes / ncopies);
+ +      num_bytes = stripe_size * data_stripes;
   
         *stripe_size_out = stripe_size;
         *num_bytes_out = num_bytes;
@@@ -3814,8 -3744,6 +3825,8 @@@
                 }
         }
   
+ +      check_raid56_incompat_flag(extent_root->fs_info, type);
+ +
         kfree(devices_info);
         return 0;
   
@@@ -4085,10 -4013,6 +4096,10 @@@ int btrfs_num_copies(struct btrfs_fs_in
                 ret = map->num_stripes;
         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
                 ret = map->sub_stripes;
+ +      else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ +              ret = 2;
+ +      else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ +              ret = 3;
         else
                 ret = 1;
         free_extent_map(em);
@@@ -4101,52 -4025,6 +4112,52 @@@
         return ret;
   }
   
+ +unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ +                                  struct btrfs_mapping_tree *map_tree,
+ +                                  u64 logical)
+ +{
+ +      struct extent_map *em;
+ +      struct map_lookup *map;
+ +      struct extent_map_tree *em_tree = &map_tree->map_tree;
+ +      unsigned long len = root->sectorsize; /* result unless RAID5/6 */
+ +
+ +      read_lock(&em_tree->lock);
+ +      em = lookup_extent_mapping(em_tree, logical, len);
+ +      read_unlock(&em_tree->lock);
+ +      BUG_ON(!em); /* a mapping for logical is required */
+ +
+ +      BUG_ON(em->start > logical || em->start + em->len < logical);
+ +      map = (struct map_lookup *)em->bdev; /* map_lookup is stored in em->bdev */
+ +      if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ +                       BTRFS_BLOCK_GROUP_RAID6)) {
+ +              len = map->stripe_len * nr_data_stripes(map);
+ +      }
+ +      free_extent_map(em);
+ +      return len; /* sectorsize, or stripe_len * nr_data_stripes for RAID5/6 */
+ +}
+ +
+ +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ +                         u64 logical, u64 len, int mirror_num)
+ +{
+ +      struct extent_map *em;
+ +      struct map_lookup *map;
+ +      struct extent_map_tree *em_tree = &map_tree->map_tree;
+ +      int ret = 0; /* stays 0 unless the mapping is RAID5/6 */
+ +
+ +      read_lock(&em_tree->lock);
+ +      em = lookup_extent_mapping(em_tree, logical, len);
+ +      read_unlock(&em_tree->lock);
+ +      BUG_ON(!em); /* a mapping for logical is required */
+ +
+ +      BUG_ON(em->start > logical || em->start + em->len < logical);
+ +      map = (struct map_lookup *)em->bdev; /* map_lookup is stored in em->bdev */
+ +      if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ +                       BTRFS_BLOCK_GROUP_RAID6))
+ +              ret = 1; /* NOTE(review): mirror_num is never consulted here */
+ +      free_extent_map(em);
+ +      return ret;
+ +}
+ +
   static int find_live_mirror(struct btrfs_fs_info *fs_info,
                             struct map_lookup *map, int first, int num,
                             int optimal, int dev_replace_is_ongoing)
@@@ -4184,39 -4062,10 +4195,39 @@@
         return optimal;
   }
   
+ +static inline int parity_smaller(u64 a, u64 b)
+ +{
+ +      return a > b; /* nonzero when a belongs after b in ascending order */
+ +}
+ +
+ +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+ +{
+ +      struct btrfs_bio_stripe s; /* swap temporary for bbio->stripes[] */
+ +      int i;
+ +      u64 l; /* swap temporary for raid_map[] */
+ +      int again = 1; /* nonzero while the last pass swapped something */
+ +
+ +      while (again) {
+ +              again = 0;
+ +              for (i = 0; i < bbio->num_stripes - 1; i++) {
+ +                      if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ +                              s = bbio->stripes[i]; /* swap both arrays together */
+ +                              l = raid_map[i];
+ +                              bbio->stripes[i] = bbio->stripes[i+1];
+ +                              raid_map[i] = raid_map[i+1];
+ +                              bbio->stripes[i+1] = s;
+ +                              raid_map[i+1] = l;
+ +                              again = 1; /* order changed: run another pass */
+ +                      }
+ +              }
+ +      }
+ +}
+ +
   static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                              u64 logical, u64 *length,
                              struct btrfs_bio **bbio_ret,
- -                           int mirror_num)
+ +                           int mirror_num, u64 **raid_map_ret)
   {
         struct extent_map *em;
         struct map_lookup *map;
@@@ -4228,8 -4077,6 +4239,8 @@@
         u64 stripe_nr;
         u64 stripe_nr_orig;
         u64 stripe_nr_end;
+ +      u64 stripe_len;
+ +      u64 *raid_map = NULL;
         int stripe_index;
         int i;
         int ret = 0;
@@@ -4241,7 -4088,6 +4252,7 @@@
         int num_alloc_stripes;
         int patch_the_first_stripe_for_dev_replace = 0;
         u64 physical_to_patch_in_first_stripe = 0;
+ +      u64 raid56_full_stripe_start = (u64)-1;
   
         read_lock(&em_tree->lock);
         em = lookup_extent_mapping(em_tree, logical, *length);
@@@ -4258,63 -4104,29 +4269,63 @@@
         map = (struct map_lookup *)em->bdev;
         offset = logical - em->start;
   
+ +      if (mirror_num > map->num_stripes)
+ +              mirror_num = 0;
+ +
+ +      stripe_len = map->stripe_len;
         stripe_nr = offset;
         /*
          * stripe_nr counts the total number of stripes we have to stride
          * to get to this block
          */
- -      do_div(stripe_nr, map->stripe_len);
+ +      do_div(stripe_nr, stripe_len);
   
- -      stripe_offset = stripe_nr * map->stripe_len;
+ +      stripe_offset = stripe_nr * stripe_len;
         BUG_ON(offset < stripe_offset);
   
         /* stripe_offset is the offset of this block in its stripe*/
         stripe_offset = offset - stripe_offset;
   
- -      if (rw & REQ_DISCARD)
+ +      /* if we're here for raid56, we need to know the stripe aligned start */
+ +      if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ +              unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+ +              raid56_full_stripe_start = offset;
+ +
+ +              /* allow a write of a full stripe, but make sure we don't
+ +               * allow straddling of stripes
+ +               */
+ +              do_div(raid56_full_stripe_start, full_stripe_len);
+ +              raid56_full_stripe_start *= full_stripe_len;
+ +      }
+ +
+ +      if (rw & REQ_DISCARD) {
+ +              /* we don't discard raid56 yet */
+ +              if (map->type &
+ +                  (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ +                      ret = -EOPNOTSUPP;
+ +                      goto out;
+ +              }
                 *length = min_t(u64, em->len - offset, *length);
- -      else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- -              /* we limit the length of each bio to what fits in a stripe */
- -              *length = min_t(u64, em->len - offset,
- -                              map->stripe_len - stripe_offset);
+ +      } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ +              u64 max_len;
+ +              /* For writes to RAID[56], allow a full stripeset across all disks.
+ +                 For other RAID types and for RAID[56] reads, just allow a single
+ +                 stripe (on a single disk). */
+ +              if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ +                  (rw & REQ_WRITE)) {
+ +                      max_len = stripe_len * nr_data_stripes(map) -
+ +                              (offset - raid56_full_stripe_start);
+ +              } else {
+ +                      /* we limit the length of each bio to what fits in a stripe */
+ +                      max_len = stripe_len - stripe_offset;
+ +              }
+ +              *length = min_t(u64, em->len - offset, max_len);
         } else {
                 *length = em->len - offset;
         }
   
+ +      /* This is for when we're called from btrfs_merge_bio_hook() and all
+ +         it cares about is the length */
         if (!bbio_ret)
                 goto out;
   
@@@ -4347,7 -4159,7 +4358,7 @@@
                 u64 physical_of_found = 0;
   
                 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
- -                           logical, &tmp_length, &tmp_bbio, 0);
+ +                           logical, &tmp_length, &tmp_bbio, 0, NULL);
                 if (ret) {
                         WARN_ON(tmp_bbio != NULL);
                         goto out;
@@@ -4413,7 -4225,6 +4424,7 @@@
         do_div(stripe_nr_end, map->stripe_len);
         stripe_end_offset = stripe_nr_end * map->stripe_len -
                             (offset + *length);
+ +
         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                 if (rw & REQ_DISCARD)
                         num_stripes = min_t(u64, map->num_stripes,
@@@ -4464,65 -4275,6 +4475,65 @@@
                                               dev_replace_is_ongoing);
                         mirror_num = stripe_index - old_stripe_index + 1;
                 }
+ +
+ +      } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ +                              BTRFS_BLOCK_GROUP_RAID6)) {
+ +              u64 tmp;
+ +
+ +              if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+ +                  && raid_map_ret) {
+ +                      int i, rot;
+ +
+ +                      /* push stripe_nr back to the start of the full stripe */
+ +                      stripe_nr = raid56_full_stripe_start;
+ +                      do_div(stripe_nr, stripe_len);
+ +
+ +                      stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ +
+ +                      /* RAID[56] write or recovery. Return all stripes */
+ +                      num_stripes = map->num_stripes;
+ +                      max_errors = nr_parity_stripes(map);
+ +
+ +                      raid_map = kmalloc(sizeof(u64) * num_stripes,
+ +                                         GFP_NOFS);
+ +                      if (!raid_map) {
+ +                              ret = -ENOMEM;
+ +                              goto out;
+ +                      }
+ +
+ +                      /* Work out the disk rotation on this stripe-set */
+ +                      tmp = stripe_nr;
+ +                      rot = do_div(tmp, num_stripes);
+ +
+ +                      /* Fill in the logical address of each stripe */
+ +                      tmp = stripe_nr * nr_data_stripes(map);
+ +                      for (i = 0; i < nr_data_stripes(map); i++)
+ +                              raid_map[(i+rot) % num_stripes] =
+ +                                      em->start + (tmp + i) * map->stripe_len;
+ +
+ +                      raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ +                      if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ +                              raid_map[(i+rot+1) % num_stripes] =
+ +                                      RAID6_Q_STRIPE;
+ +
+ +                      *length = map->stripe_len;
+ +                      stripe_index = 0;
+ +                      stripe_offset = 0;
+ +              } else {
+ +                      /*
+ +                       * Mirror #0 or #1 means the original data block.
+ +                       * Mirror #2 is RAID5 parity block.
+ +                       * Mirror #3 is RAID6 Q block.
+ +                       */
+ +                      stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ +                      if (mirror_num > 1)
+ +                              stripe_index = nr_data_stripes(map) +
+ +                                              mirror_num - 2;
+ +
+ +                      /* We distribute the parity blocks across stripes */
+ +                      tmp = stripe_nr + stripe_index;
+ +                      stripe_index = do_div(tmp, map->num_stripes);
+ +              }
         } else {
                 /*
                  * after this do_div call, stripe_nr is the number of stripes
@@@ -4631,11 -4383,8 +4642,11 @@@
         if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10 |
+ +                               BTRFS_BLOCK_GROUP_RAID5 |
                                  BTRFS_BLOCK_GROUP_DUP)) {
                         max_errors = 1;
+ +              } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ +                      max_errors = 2;
                 }
         }
   
@@@ -4736,10 -4485,6 +4747,10 @@@
                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
                 bbio->mirror_num = map->num_stripes + 1;
         }
+ +      if (raid_map) {
+ +              sort_parity_stripes(bbio, raid_map);
+ +              *raid_map_ret = raid_map;
+ +      }
   out:
         if (dev_replace_is_ongoing)
                 btrfs_dev_replace_unlock(dev_replace);
@@@ -4752,7 -4497,7 +4763,7 @@@ int btrfs_map_block(struct btrfs_fs_inf
                       struct btrfs_bio **bbio_ret, int mirror_num)
   {
         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- -                               mirror_num);
+ +                               mirror_num, NULL);
   }
   
   int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@@ -4766,7 -4511,6 +4777,7 @@@
         u64 bytenr;
         u64 length;
         u64 stripe_nr;
+ +      u64 rmap_len;
         int i, j, nr = 0;
   
         read_lock(&em_tree->lock);
@@@ -4777,17 -4521,10 +4788,17 @@@
         map = (struct map_lookup *)em->bdev;
   
         length = em->len;
+ +      rmap_len = map->stripe_len;
+ +
         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
                 do_div(length, map->num_stripes / map->sub_stripes);
         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                 do_div(length, map->num_stripes);
+ +      else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ +                            BTRFS_BLOCK_GROUP_RAID6)) {
+ +              do_div(length, nr_data_stripes(map));
+ +              rmap_len = map->stripe_len * nr_data_stripes(map);
+ +      }
   
         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
         BUG_ON(!buf); /* -ENOMEM */
@@@ -4807,11 -4544,8 +4818,11 @@@
                         do_div(stripe_nr, map->sub_stripes);
                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                         stripe_nr = stripe_nr * map->num_stripes + i;
- -              }
- -              bytenr = chunk_start + stripe_nr * map->stripe_len;
+ +              } /* else if RAID[56], multiply by nr_data_stripes().
+ +                 * Alternatively, just use rmap_len below instead of
+ +                 * map->stripe_len */
+ +
+ +              bytenr = chunk_start + stripe_nr * rmap_len;
                 WARN_ON(nr >= map->num_stripes);
                 for (j = 0; j < nr; j++) {
                         if (buf[j] == bytenr)
@@@ -4825,7 -4559,7 +4836,7 @@@
   
         *logical = buf;
         *naddrs = nr;
- -      *stripe_len = map->stripe_len;
+ +      *stripe_len = rmap_len;
   
         free_extent_map(em);
         return 0;
@@@ -4899,7 -4633,7 +4910,7 @@@ static void btrfs_end_bio(struct bio *b
                 bio->bi_bdev = (struct block_device *)
                                         (unsigned long)bbio->mirror_num;
                 /* only send an error to the higher layers if it is
- -               * beyond the tolerance of the multi-bio
+ +               * beyond the tolerance of the btrfs bio
                  */
                 if (atomic_read(&bbio->error) > bbio->max_errors) {
                         err = -EIO;
@@@ -4933,18 -4667,13 +4944,18 @@@ struct async_sched 
    * This will add one bio to the pending list for a device and make sure
    * the work struct is scheduled.
    */
- -static noinline void schedule_bio(struct btrfs_root *root,
+ +noinline void btrfs_schedule_bio(struct btrfs_root *root,
                                  struct btrfs_device *device,
                                  int rw, struct bio *bio)
   {
         int should_queue = 1;
         struct btrfs_pending_bios *pending_bios;
   
+ +      if (device->missing || !device->bdev) {
+ +              bio_endio(bio, -EIO);
+ +              return;
+ +      }
+ +
         /* don't bother with additional async steps for reads, right now */
         if (!(rw & REQ_WRITE)) {
                 bio_get(bio);
@@@ -5042,7 -4771,7 +5053,7 @@@ static void submit_stripe_bio(struct bt
   #endif
         bio->bi_bdev = dev->bdev;
         if (async)
- -              schedule_bio(root, dev, rw, bio);
+ +              btrfs_schedule_bio(root, dev, rw, bio);
         else
                 btrfsic_submit_bio(rw, bio);
   }
@@@ -5101,7 -4830,6 +5112,7 @@@ int btrfs_map_bio(struct btrfs_root *ro
         u64 logical = (u64)bio->bi_sector << 9;
         u64 length = 0;
         u64 map_length;
+ +      u64 *raid_map = NULL;
         int ret;
         int dev_nr = 0;
         int total_devs = 1;
@@@ -5110,30 -4838,12 +5121,30 @@@
         length = bio->bi_size;
         map_length = length;
   
- -      ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- -                            mirror_num);
- -      if (ret)
+ +      ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+ +                            mirror_num, &raid_map);
+ +      if (ret) /* -ENOMEM */
                 return ret;
   
         total_devs = bbio->num_stripes;
+ +      bbio->orig_bio = first_bio;
+ +      bbio->private = first_bio->bi_private;
+ +      bbio->end_io = first_bio->bi_end_io;
+ +      atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+ +
+ +      if (raid_map) {
+ +              /* In this case, map_length has been set to the length of
+ +                 a single stripe; not the whole write */
+ +              if (rw & WRITE) {
+ +                      return raid56_parity_write(root, bio, bbio,
+ +                                                 raid_map, map_length);
+ +              } else {
+ +                      return raid56_parity_recover(root, bio, bbio,
+ +                                                   raid_map, map_length,
+ +                                                   mirror_num);
+ +              }
+ +      }
+ +
         if (map_length < length) {
                 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
                        "len %llu\n", (unsigned long long)logical,
@@@ -5142,6 -4852,11 +5153,6 @@@
                 BUG();
         }
   
- -      bbio->orig_bio = first_bio;
- -      bbio->private = first_bio->bi_private;
- -      bbio->end_io = first_bio->bi_end_io;
- -      atomic_set(&bbio->stripes_pending, bbio->num_stripes);
- -
         while (dev_nr < total_devs) {
                 dev = bbio->stripes[dev_nr].dev;
                 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
author	Chris Mason <chris.mason@fusionio.com>
	Tue, 5 Feb 2013 15:04:03 +0000 (10:04 -0500)
committer	Chris Mason <chris.mason@fusionio.com>
	Tue, 5 Feb 2013 15:04:03 +0000 (10:04 -0500)
		1	2
fs/btrfs/extent-tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/free-space-cache.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/transaction.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/volumes.c	patch \|	diff1 \|	diff2 \|	blob \| history