ext4: trim allocation requests to group size
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8..5efe721 100644
@@ -652,7 +652,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        ext4_grpblk_t min;
        ext4_grpblk_t max;
        ext4_grpblk_t chunk;
-       unsigned short border;
+       unsigned int border;
 
        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 
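Why the wider type matters: a few lines below this hunk,
ext4_mb_mark_free_simple() sets border to 2 << sb->s_blocksize_bits. With
64KB blocks (s_blocksize_bits == 16) that value is 131072, which truncates
to 0 in a 16-bit variable and breaks the buddy accounting. A standalone
sketch of the truncation:

    #include <stdio.h>

    int main(void)
    {
            unsigned int blocksize_bits = 16;               /* 64KB blocks */
            unsigned short border16 = 2 << blocksize_bits;  /* truncates   */
            unsigned int   border32 = 2 << blocksize_bits;

            printf("unsigned short: %u\n", border16);       /* 0      */
            printf("unsigned int:   %u\n", border32);       /* 131072 */
            return 0;
    }
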
@@ -1312,6 +1312,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
        void *buddy2;
        struct super_block *sb = e4b->bd_sb;
 
+       if (WARN_ON(count == 0))
+               return;
        BUG_ON(first + count > (sb->s_blocksize << 3));
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        mb_check_buddy(e4b);
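
An aside on the idiom: WARN_ON(x) evaluates to x, so the new guard both
logs a backtrace and bails out in one statement:

    if (WARN_ON(count == 0))
            return;     /* a noisy no-op instead of freeing an empty range */
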
@@ -2027,7 +2029,11 @@ repeat:
                group = ac->ac_g_ex.fe_group;
 
                for (i = 0; i < ngroups; group++, i++) {
-                       if (group == ngroups)
+                       /*
+                        * Artificially restricted ngroups for non-extent
+                        * files makes group > ngroups possible on first loop.
+                        */
+                       if (group >= ngroups)
                                group = 0;
 
                        /* This now checks without needing the buddy page */
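
Why '>=' and not '==': for non-extent (indirect-block) files the caller
caps ngroups so that all block numbers fit in 32 bits, so the inherited
goal group can already lie past the cap when the loop is entered; '=='
would then never wrap it back to 0 and the scan would walk off the end of
the group table. A standalone sketch of the wraparound, with hypothetical
numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ngroups = 8, goal = 11;    /* goal past ngroups */
            unsigned int i, group;

            for (i = 0, group = goal; i < ngroups; group++, i++) {
                    if (group >= ngroups)   /* '==' never fires for 11 */
                            group = 0;
                    printf("visit group %u\n", group);      /* 0..7 */
            }
            return 0;
    }
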
@@ -2128,7 +2134,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        struct ext4_buddy e4b;
        struct sg {
                struct ext4_group_info info;
-               ext4_grpblk_t counters[16];
+               ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
        } sg;
 
        group--;
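
Why 16 was too small: this struct lives on the stack of
ext4_mb_seq_groups_show() and is filled by a memcpy() from the live
ext4_group_info, whose bb_counters[] holds one counter per buddy order,
i.e. s_blocksize_bits + 2 entries. EXT4_MAX_BLOCK_LOG_SIZE is 16, so the
worst case needs 18 slots and the hard-coded 16 corrupts the stack for
block sizes over 16KB. The arithmetic, as a standalone sketch:

    #include <stdio.h>

    int main(void)
    {
            unsigned int bits;

            /* buddy orders run from 0 to blocksize_bits + 1 */
            for (bits = 10; bits <= 16; bits++)
                    printf("%3uKB blocks need %u slots; fixed 16 %s\n",
                           (1u << bits) / 1024, bits + 2,
                           bits + 2 > 16 ? "overflows" : "fits");
            return 0;
    }
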
@@ -2567,6 +2573,9 @@ int ext4_mb_release(struct super_block *sb)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
+       if (sbi->s_proc)
+               remove_proc_entry("mb_groups", sbi->s_proc);
+
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
                        grinfo = ext4_get_group_info(sb, i);
@@ -2614,8 +2623,6 @@ int ext4_mb_release(struct super_block *sb)
        }
 
        free_percpu(sbi->s_locality_groups);
-       if (sbi->s_proc)
-               remove_proc_entry("mb_groups", sbi->s_proc);
 
        return 0;
 }
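
Note the pairing of these two ext4_mb_release() hunks: mb_groups is a proc
file that walks sbi->s_group_info, so removing it only after the group
info has been freed leaves a window in which a concurrent read
dereferences freed memory. The entry is now unpublished first. The general
shape, as a sketch with hypothetical names:

    /* ordering sketch (hypothetical helpers): unpublish the interface
     * before freeing the data it reads */
    static void obj_teardown(struct obj *o)
    {
            unpublish_interface(o); /* e.g. remove_proc_entry(); no new readers */
            free_backing_data(o);   /* e.g. the s_group_info loop above */
    }
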
@@ -2817,7 +2824,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata\n", block, block+len);
                /* File system mounted not to panic on error
-                * Fix the bitmap and repeat the block allocation
+                * Fix the bitmap and return EIO
                 * We leak some of the blocks here.
                 */
                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
@@ -2826,7 +2833,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
-                       err = -EAGAIN;
+                       err = -EIO;
                goto out_err;
        }
 
@@ -2865,8 +2872,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
-               atomic_sub(ac->ac_b_ex.fe_len,
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_sub(ac->ac_b_ex.fe_len,
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
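
Why atomic64_t: atomic_t is 32 bits even on 64-bit kernels, and a flex
group aggregates 2^s_log_groups_per_flex block groups, so its free-cluster
count can exceed what a signed 32-bit counter holds and wrap negative;
mainline made the same conversion. A standalone sketch with hypothetical
geometry:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t groups_per_flex   = 1ULL << 16;        /* hypothetical */
            uint64_t clusters_per_group = 1ULL << 15;
            uint64_t total = groups_per_flex * clusters_per_group; /* 2^31 */

            printf("64-bit count: %llu\n", (unsigned long long)total);
            /* out-of-range conversion; typically wraps to INT32_MIN */
            printf("as 32-bit:    %d\n", (int32_t)total);
            return 0;
    }
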
@@ -2996,6 +3003,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        if (ar->pright && start + size - 1 >= ar->lright)
                size -= start + size - ar->lright;
 
+       /*
+        * Trim allocation request for filesystems with artificially small
+        * groups.
+        */
+       if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
+               size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
+
        end = start + size;
 
        /* check we don't cross already preallocated blocks */
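
This is the change the subject line refers to: the normalization above
rounds requests up to power-of-two sizes as large as 8MB worth of blocks,
and on a filesystem made with artificially small groups (e.g. mke2fs -g)
that can exceed EXT4_BLOCKS_PER_GROUP(), tripping the BUG_ON adjusted
below. The clamp as a standalone sketch, with hypothetical numbers:

    #include <stdio.h>

    static unsigned long trim_to_group(unsigned long size,
                                       unsigned long blocks_per_group)
    {
            return size > blocks_per_group ? blocks_per_group : size;
    }

    int main(void)
    {
            /* 256-block groups, 2048-block normalized request */
            printf("%lu\n", trim_to_group(2048, 256));      /* 256 */
            return 0;
    }
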
@@ -3062,7 +3076,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        }
        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
                        start > ac->ac_o_ex.fe_logical);
-       BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+       BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 
        /* now prepare goal request */
 
@@ -3123,13 +3137,31 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 {
        struct ext4_prealloc_space *pa = ac->ac_pa;
-       int len;
+       struct ext4_buddy e4b;
+       int err;
 
-       if (pa && pa->pa_type == MB_INODE_PA) {
-               len = ac->ac_b_ex.fe_len;
-               pa->pa_free += len;
+       if (pa == NULL) {
+               if (ac->ac_f_ex.fe_len == 0)
+                       return;
+               err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
+               if (err) {
+                       /*
+                        * This should never happen since we pin the
+                        * pages in the ext4_allocation_context so
+                        * ext4_mb_load_buddy() should never fail.
+                        */
+                       WARN(1, "mb_load_buddy failed (%d)", err);
+                       return;
+               }
+               ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+               mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
+                              ac->ac_f_ex.fe_len);
+               ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+               ext4_mb_unload_buddy(&e4b);
+               return;
        }
-
+       if (pa->pa_type == MB_INODE_PA)
+               pa->pa_free += ac->ac_b_ex.fe_len;
 }
 
 /*
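
The rewrite above handles the case where the allocation failed after
blocks were already marked used but no preallocation was set up: those
blocks must be handed back to the buddy bitmap. The call sequence,
factored into a hypothetical helper to show the locking order (not
compilable on its own; the calls are the ones in the hunk):

    static void mb_give_back(struct super_block *sb, struct inode *inode,
                             struct ext4_free_extent *fex)
    {
            struct ext4_buddy e4b;

            if (ext4_mb_load_buddy(sb, fex->fe_group, &e4b))
                    return;                         /* warned in the real code */
            ext4_lock_group(sb, fex->fe_group);     /* buddy + bitmap updates  */
            mb_free_blocks(inode, &e4b, fex->fe_start, fex->fe_len);
            ext4_unlock_group(sb, fex->fe_group);
            ext4_mb_unload_buddy(&e4b);             /* drop buddy page refs    */
    }
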
@@ -3373,6 +3405,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 {
        struct ext4_prealloc_space *pa;
        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+       BUG_ON(atomic_read(&pa->pa_count));
+       BUG_ON(pa->pa_deleted == 0);
        kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
@@ -3386,11 +3421,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        ext4_group_t grp;
        ext4_fsblk_t grp_blk;
 
-       if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
-               return;
-
        /* in this short window concurrent discard can set pa_deleted */
        spin_lock(&pa->pa_lock);
+       if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+               spin_unlock(&pa->pa_lock);
+               return;
+       }
+
        if (pa->pa_deleted == 1) {
                spin_unlock(&pa->pa_lock);
                return;
@@ -3484,7 +3521,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
                        win = offs;
 
                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
-                       EXT4_B2C(sbi, win);
+                       EXT4_NUM_B2C(sbi, win);
                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
        }
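
The EXT4_B2C -> EXT4_NUM_B2C switches in this patch all involve block
*counts*: win here is a number of blocks, not a block number. EXT4_B2C
maps a block number to its cluster (round down); EXT4_NUM_B2C maps a block
count to the clusters needed to cover it (round up). In mainline ext4.h
the pair reads, modulo whitespace:

    #define EXT4_B2C(sbi, blk)      ((blk) >> (sbi)->s_cluster_bits)
    #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
                                     (sbi)->s_cluster_bits)

With an 8-block cluster ratio, EXT4_B2C(sbi, 3) is cluster 0 while
EXT4_NUM_B2C(sbi, 3) is 1 cluster, which is the difference that matters
when adjusting a length.
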
@@ -4057,7 +4094,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
        /* set up allocation goals */
        memset(ac, 0, sizeof(struct ext4_allocation_context));
-       ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+       ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
        ac->ac_status = AC_STATUS_CONTINUE;
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
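
EXT4_LBLK_CMASK() (and the EXT4_PBLK_COFF()/EXT4_LBLK_COFF() helpers used
further down) exists because the open-coded masks do 32-bit arithmetic:
s_cluster_ratio is an unsigned int, so ~(sbi->s_cluster_ratio - 1) is a
32-bit mask that, applied to a 64-bit physical block number elsewhere in
ext4, silently clears its upper half. The macros add explicit width casts;
in mainline they read roughly:

    #define EXT4_PBLK_CMASK(s, pblk) ((pblk) &                          \
                              ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
    #define EXT4_LBLK_CMASK(s, lblk) ((lblk) &                          \
                              ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
    /* offset of pblk/lblk within its cluster */
    #define EXT4_PBLK_COFF(s, pblk) ((pblk) &                           \
                              ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
    #define EXT4_LBLK_COFF(s, lblk) ((lblk) &                           \
                              ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
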
@@ -4177,7 +4214,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
                /* The max size of hash table is PREALLOC_TB_SIZE */
                order = PREALLOC_TB_SIZE - 1;
        /* Add the prealloc space to lg */
-       rcu_read_lock();
+       spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
                                                pa_inode_list) {
                spin_lock(&tmp_pa->pa_lock);
@@ -4201,12 +4238,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
        if (!added)
                list_add_tail_rcu(&pa->pa_inode_list,
                                        &lg->lg_prealloc_list[order]);
-       rcu_read_unlock();
+       spin_unlock(&lg->lg_prealloc_lock);
 
        /* Now trim the list to be not more than 8 elements */
        if (lg_prealloc_count > 8) {
                ext4_mb_discard_lg_preallocations(sb, lg,
-                                               order, lg_prealloc_count);
+                                                 order, lg_prealloc_count);
                return;
        }
        return ;
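
The locking change above follows the basic RCU-list contract:
rcu_read_lock() is only enough to *walk* the list, while writers calling
list_add_tail_rcu()/list_del_rcu() must exclude each other with a real
lock, here lg_prealloc_lock; readers elsewhere stay lockless. The split,
as a generic sketch with hypothetical names:

    rcu_read_lock();                        /* reader side */
    list_for_each_entry_rcu(pa, &head, node)
            inspect(pa);
    rcu_read_unlock();

    spin_lock(&writer_lock);                /* writer side */
    list_add_tail_rcu(&new_pa->node, &head);
    spin_unlock(&writer_lock);
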
@@ -4371,18 +4408,7 @@ repeat:
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
-               if (*errp == -EAGAIN) {
-                       /*
-                        * drop the reference that we took
-                        * in ext4_mb_use_best_found
-                        */
-                       ext4_mb_release_context(ac);
-                       ac->ac_b_ex.fe_group = 0;
-                       ac->ac_b_ex.fe_start = 0;
-                       ac->ac_b_ex.fe_len = 0;
-                       ac->ac_status = AC_STATUS_CONTINUE;
-                       goto repeat;
-               } else if (*errp)
+               if (*errp)
                errout:
                        ext4_discard_allocated_blocks(ac);
                else {
@@ -4595,7 +4621,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
         * blocks at the beginning or the end unless we are explicitly
         * requested to avoid doing so.
         */
-       overflow = block & (sbi->s_cluster_ratio - 1);
+       overflow = EXT4_PBLK_COFF(sbi, block);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
                        overflow = sbi->s_cluster_ratio - overflow;
@@ -4609,7 +4635,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        count += overflow;
                }
        }
-       overflow = count & (sbi->s_cluster_ratio - 1);
+       overflow = EXT4_LBLK_COFF(sbi, count);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
                        if (count > overflow)
@@ -4633,7 +4659,7 @@ do_more:
                        EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
        }
-       count_clusters = EXT4_B2C(sbi, count);
+       count_clusters = EXT4_NUM_B2C(sbi, count);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh) {
                err = -EIO;
@@ -4690,12 +4716,12 @@ do_more:
                /*
                 * blocks being freed are metadata. these blocks shouldn't
                 * be used until this transaction is committed
+                *
+                * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
+                * to fail.
                 */
-               new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-               if (!new_entry) {
-                       err = -ENOMEM;
-                       goto error_return;
-               }
+               new_entry = kmem_cache_alloc(ext4_free_ext_cachep,
+                               GFP_NOFS|__GFP_NOFAIL);
                new_entry->start_cluster = bit;
                new_entry->group  = block_group;
                new_entry->count = count_clusters;
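
__GFP_NOFAIL makes the slab allocator retry internally instead of
returning NULL; as the new comment says, ext4_free_blocks() is not allowed
to fail, so the removed error path had no sane recovery anyway. The
resulting calling pattern (same call as the hunk):

    new_entry = kmem_cache_alloc(ext4_free_ext_cachep,
                                 GFP_NOFS | __GFP_NOFAIL);
    /* no NULL check: the allocation sleeps until it succeeds */
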
@@ -4722,8 +4748,8 @@ do_more:
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(count_clusters,
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(count_clusters,
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);
@@ -4863,12 +4889,12 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeclusters_counter,
-                          EXT4_B2C(sbi, blocks_freed));
+                          EXT4_NUM_B2C(sbi, blocks_freed));
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(EXT4_B2C(sbi, blocks_freed),
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);