ext4: trim allocation requests to group size
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8..5efe721 100644
@@ -652,7 +652,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        ext4_grpblk_t min;
        ext4_grpblk_t max;
        ext4_grpblk_t chunk;
-       unsigned short border;
+       unsigned int border;
 
        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 
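Why the wider type matters: a few lines below this hunk,
ext4_mb_mark_free_simple() sets border to 2 << sb->s_blocksize_bits. With
64KB blocks (s_blocksize_bits == 16) that value is 131072, which truncates
to 0 in a 16-bit variable and breaks the buddy accounting. A standalone
sketch of the truncation:

    #include <stdio.h>

    int main(void)
    {
            unsigned int blocksize_bits = 16;               /* 64KB blocks */
            unsigned short border16 = 2 << blocksize_bits;  /* truncates   */
            unsigned int   border32 = 2 << blocksize_bits;

            printf("unsigned short: %u\n", border16);       /* 0      */
            printf("unsigned int:   %u\n", border32);       /* 131072 */
            return 0;
    }
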
@@ -1312,6 +1312,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
        void *buddy2;
        struct super_block *sb = e4b->bd_sb;
 
+       if (WARN_ON(count == 0))
+               return;
        BUG_ON(first + count > (sb->s_blocksize << 3));
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        mb_check_buddy(e4b);
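
An aside on the idiom: WARN_ON(x) evaluates to x, so the new guard both
logs a backtrace and bails out in one statement:

    if (WARN_ON(count == 0))
            return;     /* a noisy no-op instead of freeing an empty range */
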
@@ -2027,7 +2029,11 @@ repeat:
                group = ac->ac_g_ex.fe_group;
 
                for (i = 0; i < ngroups; group++, i++) {
-                       if (group == ngroups)
+                       /*
+                        * Artificially restricted ngroups for non-extent
+                        * files makes group > ngroups possible on first loop.
+                        */
+                       if (group >= ngroups)
                                group = 0;
 
                        /* This now checks without needing the buddy page */
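
Why '>=' and not '==': for non-extent (indirect-block) files the caller
caps ngroups so that all block numbers fit in 32 bits, so the inherited
goal group can already lie past the cap when the loop is entered; '=='
would then never wrap it back to 0 and the scan would walk off the end of
the group table. A standalone sketch of the wraparound, with hypothetical
numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ngroups = 8, goal = 11;    /* goal past ngroups */
            unsigned int i, group;

            for (i = 0, group = goal; i < ngroups; group++, i++) {
                    if (group >= ngroups)   /* '==' never fires for 11 */
                            group = 0;
                    printf("visit group %u\n", group);      /* 0..7 */
            }
            return 0;
    }
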
@@ -2128,7 +2134,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        struct ext4_buddy e4b;
        struct sg {
                struct ext4_group_info info;
-               ext4_grpblk_t counters[16];
+               ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
        } sg;
 
        group--;
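
Why 16 was too small: this struct lives on the stack of
ext4_mb_seq_groups_show() and is filled by a memcpy() from the live
ext4_group_info, whose bb_counters[] holds one counter per buddy order,
i.e. s_blocksize_bits + 2 entries. EXT4_MAX_BLOCK_LOG_SIZE is 16, so the
worst case needs 18 slots and the hard-coded 16 corrupts the stack for
block sizes over 16KB. The arithmetic, as a standalone sketch:

    #include <stdio.h>

    int main(void)
    {
            unsigned int bits;

            /* buddy orders run from 0 to blocksize_bits + 1 */
            for (bits = 10; bits <= 16; bits++)
                    printf("%3uKB blocks need %u slots; fixed 16 %s\n",
                           (1u << bits) / 1024, bits + 2,
                           bits + 2 > 16 ? "overflows" : "fits");
            return 0;
    }
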
@@ -2567,6 +2573,9 @@ int ext4_mb_release(struct super_block *sb)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
+       if (sbi->s_proc)
+               remove_proc_entry("mb_groups", sbi->s_proc);
+
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
                        grinfo = ext4_get_group_info(sb, i);
@@ -2614,8 +2623,6 @@ int ext4_mb_release(struct super_block *sb)
        }
 
        free_percpu(sbi->s_locality_groups);
-       if (sbi->s_proc)
-               remove_proc_entry("mb_groups", sbi->s_proc);
 
        return 0;
 }
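
Note the pairing of these two ext4_mb_release() hunks: mb_groups is a proc
file that walks sbi->s_group_info, so removing it only after the group
info has been freed leaves a window in which a concurrent read
dereferences freed memory. The entry is now unpublished first. The general
shape, as a sketch with hypothetical names:

    /* ordering sketch (hypothetical helpers): unpublish the interface
     * before freeing the data it reads */
    static void obj_teardown(struct obj *o)
    {
            unpublish_interface(o); /* e.g. remove_proc_entry(); no new readers */
            free_backing_data(o);   /* e.g. the s_group_info loop above */
    }
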
@@ -2817,7 +2824,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata\n", block, block+len);
                /* File system mounted not to panic on error
-                * Fix the bitmap and repeat the block allocation
+                * Fix the bitmap and return EIO
                 * We leak some of the blocks here.
                 */
                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
@@ -2826,7 +2833,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
-                       err = -EAGAIN;
+                       err = -EIO;
                goto out_err;
        }
 
@@ -2865,8 +2872,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
-               atomic_sub(ac->ac_b_ex.fe_len,
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_sub(ac->ac_b_ex.fe_len,
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
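
Why atomic64_t: atomic_t is 32 bits even on 64-bit kernels, and a flex
group aggregates 2^s_log_groups_per_flex block groups, so its free-cluster
count can exceed what a signed 32-bit counter holds and wrap negative;
mainline made the same conversion. A standalone sketch with hypothetical
geometry:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t groups_per_flex   = 1ULL << 16;        /* hypothetical */
            uint64_t clusters_per_group = 1ULL << 15;
            uint64_t total = groups_per_flex * clusters_per_group; /* 2^31 */

            printf("64-bit count: %llu\n", (unsigned long long)total);
            /* out-of-range conversion; typically wraps to INT32_MIN */
            printf("as 32-bit:    %d\n", (int32_t)total);
            return 0;
    }
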
@@ -2996,6 +3003,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        if (ar->pright && start + size - 1 >= ar->lright)
                size -= start + size - ar->lright;
 
+       /*
+        * Trim allocation request for filesystems with artificially small
+        * groups.
+        */
+       if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
+               size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
+
        end = start + size;
 
        /* check we don't cross already preallocated blocks */
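
This is the change the subject line refers to: the normalization above
rounds requests up to power-of-two sizes as large as 8MB worth of blocks,
and on a filesystem made with artificially small groups (e.g. mke2fs -g)
that can exceed EXT4_BLOCKS_PER_GROUP(), tripping the BUG_ON adjusted
below. The clamp as a standalone sketch, with hypothetical numbers:

    #include <stdio.h>

    static unsigned long trim_to_group(unsigned long size,
                                       unsigned long blocks_per_group)
    {
            return size > blocks_per_group ? blocks_per_group : size;
    }

    int main(void)
    {
            /* 256-block groups, 2048-block normalized request */
            printf("%lu\n", trim_to_group(2048, 256));      /* 256 */
            return 0;
    }
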
@@ -3062,7 +3076,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        }
        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
                        start > ac->ac_o_ex.fe_logical);
-       BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+       BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 
        /* now prepare goal request */
 
@@ -3123,13 +3137,31 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 {
        struct ext4_prealloc_space *pa = ac->ac_pa;
-       int len;
+       struct ext4_buddy e4b;
+       int err;
 
-       if (pa && pa->pa_type == MB_INODE_PA) {
-               len = ac->ac_b_ex.fe_len;
-               pa->pa_free += len;
+       if (pa == NULL) {
+               if (ac->ac_f_ex.fe_len == 0)
+                       return;
+               err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
+               if (err) {
+                       /*
+                        * This should never happen since we pin the
+                        * pages in the ext4_allocation_context so
+                        * ext4_mb_load_buddy() should never fail.
+                        */
+                       WARN(1, "mb_load_buddy failed (%d)", err);
+                       return;
+               }
+               ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+               mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
+                              ac->ac_f_ex.fe_len);
+               ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
+               ext4_mb_unload_buddy(&e4b);
+               return;
        }
-
+       if (pa->pa_type == MB_INODE_PA)
+               pa->pa_free += ac->ac_b_ex.fe_len;
 }
 
 /*
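
The rewrite above handles the case where the allocation failed after
blocks were already marked used but no preallocation was set up: those
blocks must be handed back to the buddy bitmap. The call sequence,
factored into a hypothetical helper to show the locking order (not
compilable on its own; the calls are the ones in the hunk):

    static void mb_give_back(struct super_block *sb, struct inode *inode,
                             struct ext4_free_extent *fex)
    {
            struct ext4_buddy e4b;

            if (ext4_mb_load_buddy(sb, fex->fe_group, &e4b))
                    return;                         /* warned in the real code */
            ext4_lock_group(sb, fex->fe_group);     /* buddy + bitmap updates  */
            mb_free_blocks(inode, &e4b, fex->fe_start, fex->fe_len);
            ext4_unlock_group(sb, fex->fe_group);
            ext4_mb_unload_buddy(&e4b);             /* drop buddy page refs    */
    }
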
@@ -3373,6 +3405,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 {
        struct ext4_prealloc_space *pa;
        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+       BUG_ON(atomic_read(&pa->pa_count));
+       BUG_ON(pa->pa_deleted == 0);
        kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
@@ -3386,11 +3421,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        ext4_group_t grp;
        ext4_fsblk_t grp_blk;
 
-       if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
-               return;
-
        /* in this short window concurrent discard can set pa_deleted */
        spin_lock(&pa->pa_lock);
+       if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+               spin_unlock(&pa->pa_lock);
+               return;
+       }
+
        if (pa->pa_deleted == 1) {
                spin_unlock(&pa->pa_lock);
                return;
@@ -3484,7 +3521,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
                        win = offs;
 
                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
-                       EXT4_B2C(sbi, win);
+                       EXT4_NUM_B2C(sbi, win);
                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
        }
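
The EXT4_B2C -> EXT4_NUM_B2C switches in this patch all involve block
*counts*: win here is a number of blocks, not a block number. EXT4_B2C
maps a block number to its cluster (round down); EXT4_NUM_B2C maps a block
count to the clusters needed to cover it (round up). In mainline ext4.h
the pair reads, modulo whitespace:

    #define EXT4_B2C(sbi, blk)      ((blk) >> (sbi)->s_cluster_bits)
    #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
                                     (sbi)->s_cluster_bits)

With an 8-block cluster ratio, EXT4_B2C(sbi, 3) is cluster 0 while
EXT4_NUM_B2C(sbi, 3) is 1 cluster, which is the difference that matters
when adjusting a length.
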
@@ -4057,7 +4094,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
        /* set up allocation goals */
        memset(ac, 0, sizeof(struct ext4_allocation_context));
-       ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+       ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
        ac->ac_status = AC_STATUS_CONTINUE;
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
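
EXT4_LBLK_CMASK() (and the EXT4_PBLK_COFF()/EXT4_LBLK_COFF() helpers used
further down) exists because the open-coded masks do 32-bit arithmetic:
s_cluster_ratio is an unsigned int, so ~(sbi->s_cluster_ratio - 1) is a
32-bit mask that, applied to a 64-bit physical block number elsewhere in
ext4, silently clears its upper half. The macros add explicit width casts;
in mainline they read roughly:

    #define EXT4_PBLK_CMASK(s, pblk) ((pblk) &                          \
                              ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
    #define EXT4_LBLK_CMASK(s, lblk) ((lblk) &                          \
                              ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
    /* offset of pblk/lblk within its cluster */
    #define EXT4_PBLK_COFF(s, pblk) ((pblk) &                           \
                              ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
    #define EXT4_LBLK_COFF(s, lblk) ((lblk) &                           \
                              ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
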
@@ -4177,7 +4214,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
                /* The max size of hash table is PREALLOC_TB_SIZE */
                order = PREALLOC_TB_SIZE - 1;
        /* Add the prealloc space to lg */
-       rcu_read_lock();
+       spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
                                                pa_inode_list) {
                spin_lock(&tmp_pa->pa_lock);
@@ -4201,12 +4238,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
        if (!added)
                list_add_tail_rcu(&pa->pa_inode_list,
                                        &lg->lg_prealloc_list[order]);
-       rcu_read_unlock();
+       spin_unlock(&lg->lg_prealloc_lock);
 
        /* Now trim the list to be not more than 8 elements */
        if (lg_prealloc_count > 8) {
                ext4_mb_discard_lg_preallocations(sb, lg,
-                                               order, lg_prealloc_count);
+                                                 order, lg_prealloc_count);
                return;
        }
        return ;
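
The locking change above follows the basic RCU-list contract:
rcu_read_lock() is only enough to *walk* the list, while writers calling
list_add_tail_rcu()/list_del_rcu() must exclude each other with a real
lock, here lg_prealloc_lock; readers elsewhere stay lockless. The split,
as a generic sketch with hypothetical names:

    rcu_read_lock();                        /* reader side */
    list_for_each_entry_rcu(pa, &head, node)
            inspect(pa);
    rcu_read_unlock();

    spin_lock(&writer_lock);                /* writer side */
    list_add_tail_rcu(&new_pa->node, &head);
    spin_unlock(&writer_lock);
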
@@ -4371,18 +4408,7 @@ repeat:
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
-               if (*errp == -EAGAIN) {
-                       /*
-                        * drop the reference that we took
-                        * in ext4_mb_use_best_found
-                        */
-                       ext4_mb_release_context(ac);
-                       ac->ac_b_ex.fe_group = 0;
-                       ac->ac_b_ex.fe_start = 0;
-                       ac->ac_b_ex.fe_len = 0;
-                       ac->ac_status = AC_STATUS_CONTINUE;
-                       goto repeat;
-               } else if (*errp)
+               if (*errp)
                errout:
                        ext4_discard_allocated_blocks(ac);
                else {
@@ -4595,7 +4621,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
         * blocks at the beginning or the end unless we are explicitly
         * requested to avoid doing so.
         */
-       overflow = block & (sbi->s_cluster_ratio - 1);
+       overflow = EXT4_PBLK_COFF(sbi, block);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
                        overflow = sbi->s_cluster_ratio - overflow;
@@ -4609,7 +4635,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        count += overflow;
                }
        }
-       overflow = count & (sbi->s_cluster_ratio - 1);
+       overflow = EXT4_LBLK_COFF(sbi, count);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
                        if (count > overflow)
@@ -4633,7 +4659,7 @@ do_more:
                        EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
        }
-       count_clusters = EXT4_B2C(sbi, count);
+       count_clusters = EXT4_NUM_B2C(sbi, count);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh) {
                err = -EIO;
@@ -4690,12 +4716,12 @@ do_more:
                /*
                 * blocks being freed are metadata. these blocks shouldn't
                 * be used until this transaction is committed
+                *
+                * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
+                * to fail.
                 */
-               new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-               if (!new_entry) {
-                       err = -ENOMEM;
-                       goto error_return;
-               }
+               new_entry = kmem_cache_alloc(ext4_free_ext_cachep,
+                               GFP_NOFS|__GFP_NOFAIL);
                new_entry->start_cluster = bit;
                new_entry->group  = block_group;
                new_entry->count = count_clusters;
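
__GFP_NOFAIL makes the slab allocator retry internally instead of
returning NULL; as the new comment says, ext4_free_blocks() is not allowed
to fail, so the removed error path had no sane recovery anyway. The
resulting calling pattern (same call as the hunk):

    new_entry = kmem_cache_alloc(ext4_free_ext_cachep,
                                 GFP_NOFS | __GFP_NOFAIL);
    /* no NULL check: the allocation sleeps until it succeeds */
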
@@ -4722,8 +4748,8 @@ do_more:
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(count_clusters,
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(count_clusters,
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);
@@ -4863,12 +4889,12 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeclusters_counter,
-                          EXT4_B2C(sbi, blocks_freed));
+                          EXT4_NUM_B2C(sbi, blocks_freed));
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(EXT4_B2C(sbi, blocks_freed),
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);