ext4: synchronize ext4_mb_init_group() with buddy page lock
[pandora-kernel.git] / fs / ext4 / mballoc.c
index d1fe09a..7311f25 100644 (file)
@@ -92,7 +92,7 @@
  * between CPUs. It is possible to get scheduled at this point.
  *
  * The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
  *
  * If we can't allocate blocks via inode prealloc or/and locality group
  * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -432,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
        }
 
        /* at order 0 we see each particular block */
-       *max = 1 << (e4b->bd_blkbits + 3);
-       if (order == 0)
+       if (order == 0) {
+               *max = 1 << (e4b->bd_blkbits + 3);
                return EXT4_MB_BITMAP(e4b);
+       }
 
        bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -616,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 
        grp = ext4_get_group_info(sb, e4b->bd_group);
-       buddy = mb_find_buddy(e4b, 0, &max);
        list_for_each(cur, &grp->bb_prealloc_list) {
                ext4_group_t groupnr;
                struct ext4_prealloc_space *pa;
@@ -635,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 #define mb_check_buddy(e4b)
 #endif
 
-/* FIXME!! need more doc */
+/*
+ * Divide the blocks starting at @first with length @len into
+ * smaller chunks, each a power-of-2 number of blocks.
+ * Clear the bits in the bitmap covered by the blocks of each chunk,
+ * then increase bb_counters[] for the corresponding chunk size.
+ */
 static void ext4_mb_mark_free_simple(struct super_block *sb,
                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
                                        struct ext4_group_info *grp)
@@ -952,22 +957,21 @@ out:
 }
 
 /*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This makes sure other parallel init_group
+ * on the same buddy page doesn't happen while holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
  */
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
-                                       ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+               ext4_group_t group, struct ext4_buddy *e4b)
 {
-       int i;
-       int block, pnum;
+       struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+       int block, pnum, poff;
        int blocks_per_page;
-       int groups_per_page;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
+       struct page *page;
+
+       e4b->bd_buddy_page = NULL;
+       e4b->bd_bitmap_page = NULL;
 
        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
        /*
@@ -977,57 +981,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
         */
        block = group * 2;
        pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-
-       groups_per_page = blocks_per_page >> 1;
-       if (groups_per_page == 0)
-               groups_per_page = 1;
-       /* read all groups the page covers into the cache */
-       for (i = 0; i < groups_per_page; i++) {
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (!page)
+               return -EIO;
+       BUG_ON(page->mapping != inode->i_mapping);
+       e4b->bd_bitmap_page = page;
+       e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-               if ((first_group + i) >= ngroups)
-                       break;
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               down_write_nested(&grp->alloc_sem, i);
+       if (blocks_per_page >= 2) {
+               /* buddy and bitmap are on the same page */
+               return 0;
        }
-       return i;
+
+       block++;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (!page)
+               return -EIO;
+       BUG_ON(page->mapping != inode->i_mapping);
+       e4b->bd_buddy_page = page;
+       return 0;
 }
 
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                        ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       /* release locks on all the groups */
-       for (i = 0; i < locked_group; i++) {
-
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               up_write(&grp->alloc_sem);
+       if (e4b->bd_bitmap_page) {
+               unlock_page(e4b->bd_bitmap_page);
+               page_cache_release(e4b->bd_bitmap_page);
+       }
+       if (e4b->bd_buddy_page) {
+               unlock_page(e4b->bd_buddy_page);
+               page_cache_release(e4b->bd_buddy_page);
        }
-
 }
 
 /*
@@ -1039,93 +1026,60 @@ static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
 
-       int ret = 0;
-       void *bitmap;
-       int blocks_per_page;
-       int block, pnum, poff;
-       int num_grp_locked = 0;
        struct ext4_group_info *this_grp;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct inode *inode = sbi->s_buddy_cache;
-       struct page *page = NULL, *bitmap_page = NULL;
+       struct ext4_buddy e4b;
+       struct page *page;
+       int ret = 0;
 
        mb_debug(1, "init group %u\n", group);
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
        this_grp = ext4_get_group_info(sb, group);
        /*
         * This ensures that we don't reinit the buddy cache
         * page which map to the group from which we are already
         * allocating. If we are looking at the buddy cache we would
         * have taken a reference using ext4_mb_load_buddy and that
-        * would have taken the alloc_sem lock.
+        * would have pinned buddy page to page cache.
         */
-       num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
-       if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+       ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+       if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
                /*
                 * somebody initialized the group
                 * return without doing anything
                 */
-               ret = 0;
                goto err;
        }
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       poff = block % blocks_per_page;
-       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-       if (page) {
-               BUG_ON(page->mapping != inode->i_mapping);
-               ret = ext4_mb_init_cache(page, NULL);
-               if (ret) {
-                       unlock_page(page);
-                       goto err;
-               }
-               unlock_page(page);
-       }
-       if (page == NULL || !PageUptodate(page)) {
+
+       page = e4b.bd_bitmap_page;
+       ret = ext4_mb_init_cache(page, NULL);
+       if (ret)
+               goto err;
+       if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }
        mark_page_accessed(page);
-       bitmap_page = page;
-       bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-       /* init buddy cache */
-       block++;
-       pnum = block / blocks_per_page;
-       poff = block % blocks_per_page;
-       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-       if (page == bitmap_page) {
+       if (e4b.bd_buddy_page == NULL) {
                /*
                 * If both the bitmap and buddy are in
                 * the same page we don't need to force
                 * init the buddy
                 */
-               unlock_page(page);
-       } else if (page) {
-               BUG_ON(page->mapping != inode->i_mapping);
-               ret = ext4_mb_init_cache(page, bitmap);
-               if (ret) {
-                       unlock_page(page);
-                       goto err;
-               }
-               unlock_page(page);
+               ret = 0;
+               goto err;
        }
-       if (page == NULL || !PageUptodate(page)) {
+       /* init buddy cache */
+       page = e4b.bd_buddy_page;
+       ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+       if (ret)
+               goto err;
+       if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }
        mark_page_accessed(page);
 err:
-       ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-       if (bitmap_page)
-               page_cache_release(bitmap_page);
-       if (page)
-               page_cache_release(page);
+       ext4_mb_put_buddy_page_lock(&e4b);
        return ret;
 }
 
@@ -1268,6 +1222,8 @@ repeat_load_buddy:
        return 0;
 
 err:
+       if (page)
+               page_cache_release(page);
        if (e4b->bd_bitmap_page)
                page_cache_release(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
@@ -2381,7 +2337,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
         * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
         * So a two level scheme suffices for now. */
-       sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
+       sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
        if (sbi->s_group_info == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
                return -ENOMEM;
@@ -2654,7 +2610,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct super_block *sb = journal->j_private;
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
-       int err, ret, count = 0, count2 = 0;
+       int err, count = 0, count2 = 0;
        struct ext4_free_data *entry;
        struct list_head *l, *ltmp;
 
@@ -2664,15 +2620,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
 
-               if (test_opt(sb, DISCARD)) {
-                       ret = ext4_issue_discard(sb, entry->group,
-                                       entry->start_blk, entry->count);
-                       if (unlikely(ret == -EOPNOTSUPP)) {
-                               ext4_warning(sb, "discard not supported, "
-                                                "disabling");
-                               clear_opt(sb, DISCARD);
-                       }
-               }
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, entry->group,
+                                          entry->start_blk, entry->count);
 
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -3208,7 +3158,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
        cur_distance = abs(goal_block - cpa->pa_pstart);
        new_distance = abs(goal_block - pa->pa_pstart);
 
-       if (cur_distance < new_distance)
+       if (cur_distance <= new_distance)
                return cpa;
 
        /* drop the previous reference */
@@ -3907,7 +3857,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
        struct super_block *sb = ac->ac_sb;
        ext4_group_t ngroups, i;
 
-       if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+       if (!mb_enable_debug ||
+           (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
                return;
 
        printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4697,6 +4648,127 @@ error_return:
        return;
 }
 
+/**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle:                    handle to this transaction
+ * @sb:                                super block
+ * @block:                     start physical block to add to the block group
+ * @count:                     number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                        ext4_fsblk_t block, unsigned long count)
+{
+       struct buffer_head *bitmap_bh = NULL;
+       struct buffer_head *gd_bh;
+       ext4_group_t block_group;
+       ext4_grpblk_t bit;
+       unsigned int i;
+       struct ext4_group_desc *desc;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_buddy e4b;
+       int err = 0, ret, blk_free_count;
+       ext4_grpblk_t blocks_freed;
+       struct ext4_group_info *grp;
+
+       ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+       ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+       grp = ext4_get_group_info(sb, block_group);
+       /*
+        * Check to see if we are freeing blocks across a group
+        * boundary.
+        */
+       if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+               goto error_return;
+
+       bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+       if (!bitmap_bh)
+               goto error_return;
+       desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+       if (!desc)
+               goto error_return;
+
+       if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+           in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+           in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+           in_range(block + count - 1, ext4_inode_table(sb, desc),
+                    sbi->s_itb_per_group)) {
+               ext4_error(sb, "Adding blocks in system zones - "
+                          "Block = %llu, count = %lu",
+                          block, count);
+               goto error_return;
+       }
+
+       BUFFER_TRACE(bitmap_bh, "getting write access");
+       err = ext4_journal_get_write_access(handle, bitmap_bh);
+       if (err)
+               goto error_return;
+
+       /*
+        * We are about to modify some metadata.  Call the journal APIs
+        * to unshare ->b_data if a currently-committing transaction is
+        * using it
+        */
+       BUFFER_TRACE(gd_bh, "get_write_access");
+       err = ext4_journal_get_write_access(handle, gd_bh);
+       if (err)
+               goto error_return;
+
+       for (i = 0, blocks_freed = 0; i < count; i++) {
+               BUFFER_TRACE(bitmap_bh, "clear bit");
+               if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+                       ext4_error(sb, "bit already cleared for block %llu",
+                                  (ext4_fsblk_t)(block + i));
+                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
+               } else {
+                       blocks_freed++;
+               }
+       }
+
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+               goto error_return;
+
+       /*
+        * need to update group_info->bb_free and bitmap
+        * with the group lock held. generate_buddy looks at
+        * them with the group lock held
+        */
+       ext4_lock_group(sb, block_group);
+       mb_clear_bits(bitmap_bh->b_data, bit, count);
+       mb_free_blocks(NULL, &e4b, bit, count);
+       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+       ext4_free_blks_set(sb, desc, blk_free_count);
+       desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+       ext4_unlock_group(sb, block_group);
+       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+               atomic_add(blocks_freed,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
+       }
+
+       ext4_mb_unload_buddy(&e4b);
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+       if (!err)
+               err = ret;
+
+error_return:
+       brelse(bitmap_bh);
+       ext4_std_error(sb, err);
+       return;
+}
+
 /**
  * ext4_trim_extent -- function to TRIM one single free extent in the group
  * @sb:                super block for the file system
@@ -4709,11 +4781,10 @@ error_return:
  * one will allocate those blocks, mark it as used in buddy bitmap. This must
  * be called with under the group lock.
  */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
-               ext4_group_t group, struct ext4_buddy *e4b)
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+                            ext4_group_t group, struct ext4_buddy *e4b)
 {
        struct ext4_free_extent ex;
-       int ret = 0;
 
        assert_spin_locked(ext4_group_lock_ptr(sb, group));
 
@@ -4727,12 +4798,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
         */
        mb_mark_used(e4b, &ex);
        ext4_unlock_group(sb, group);
-
-       ret = ext4_issue_discard(sb, group, start, count);
-
+       ext4_issue_discard(sb, group, start, count);
        ext4_lock_group(sb, group);
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
-       return ret;
 }
 
 /**
@@ -4753,13 +4821,13 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
  * bitmap. Then issue a TRIM command on this extent and free the extent in
  * the group buddy bitmap. This is done until whole group is scanned.
  */
-ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
                ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
 {
        void *bitmap;
        ext4_grpblk_t next, count = 0;
        ext4_group_t group;
-       int ret = 0;
 
        BUG_ON(e4b == NULL);
 
@@ -4776,10 +4844,8 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
                next = mb_find_next_bit(bitmap, max, start);
 
                if ((next - start) >= minblocks) {
-                       ret = ext4_trim_extent(sb, start,
-                               next - start, group, e4b);
-                       if (ret < 0)
-                               break;
+                       ext4_trim_extent(sb, start,
+                                        next - start, group, e4b);
                        count += next - start;
                }
                start = next + 1;
@@ -4803,9 +4869,6 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
        ext4_debug("trimmed %d blocks in the group %d\n",
                count, group);
 
-       if (ret < 0)
-               count = ret;
-
        return count;
 }
 
@@ -4863,10 +4926,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                        break;
                }
 
-               if (len >= EXT4_BLOCKS_PER_GROUP(sb))
-                       len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
-               else
+               /*
+                * For all the groups except the last one, last block will
+                * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
+                * change it for the last group in which case start +
+                * len < EXT4_BLOCKS_PER_GROUP(sb).
+                */
+               if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
                        last_block = first_block + len;
+               len -= last_block - first_block;
 
                if (e4b.bd_info->bb_free >= minlen) {
                        cnt = ext4_trim_all_free(sb, &e4b, first_block,