ext4: synchronize ext4_mb_init_group() with buddy page lock
[pandora-kernel.git] / fs / ext4 / mballoc.c
index d8a16ee..7311f25 100644 (file)
@@ -957,22 +957,21 @@ out:
 }
 
 /*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This makes sure other parallel init_group
+ * on the same buddy page doesn't happen while holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
  */
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
-                                       ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+               ext4_group_t group, struct ext4_buddy *e4b)
 {
-       int i;
-       int block, pnum;
+       struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+       int block, pnum, poff;
        int blocks_per_page;
-       int groups_per_page;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
+       struct page *page;
+
+       e4b->bd_buddy_page = NULL;
+       e4b->bd_bitmap_page = NULL;
 
        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
        /*
@@ -982,57 +981,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
         */
        block = group * 2;
        pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-
-       groups_per_page = blocks_per_page >> 1;
-       if (groups_per_page == 0)
-               groups_per_page = 1;
-       /* read all groups the page covers into the cache */
-       for (i = 0; i < groups_per_page; i++) {
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (!page)
+               return -EIO;
+       BUG_ON(page->mapping != inode->i_mapping);
+       e4b->bd_bitmap_page = page;
+       e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-               if ((first_group + i) >= ngroups)
-                       break;
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               down_write_nested(&grp->alloc_sem, i);
+       if (blocks_per_page >= 2) {
+               /* buddy and bitmap are on the same page */
+               return 0;
        }
-       return i;
+
+       block++;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (!page)
+               return -EIO;
+       BUG_ON(page->mapping != inode->i_mapping);
+       e4b->bd_buddy_page = page;
+       return 0;
 }
 
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                        ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       /* release locks on all the groups */
-       for (i = 0; i < locked_group; i++) {
-
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               up_write(&grp->alloc_sem);
+       if (e4b->bd_bitmap_page) {
+               unlock_page(e4b->bd_bitmap_page);
+               page_cache_release(e4b->bd_bitmap_page);
+       }
+       if (e4b->bd_buddy_page) {
+               unlock_page(e4b->bd_buddy_page);
+               page_cache_release(e4b->bd_buddy_page);
        }
-
 }
 
 /*
@@ -1044,93 +1026,60 @@ static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
 
-       int ret = 0;
-       void *bitmap;
-       int blocks_per_page;
-       int block, pnum, poff;
-       int num_grp_locked = 0;
        struct ext4_group_info *this_grp;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct inode *inode = sbi->s_buddy_cache;
-       struct page *page = NULL, *bitmap_page = NULL;
+       struct ext4_buddy e4b;
+       struct page *page;
+       int ret = 0;
 
        mb_debug(1, "init group %u\n", group);
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
        this_grp = ext4_get_group_info(sb, group);
        /*
         * This ensures that we don't reinit the buddy cache
         * page which map to the group from which we are already
         * allocating. If we are looking at the buddy cache we would
         * have taken a reference using ext4_mb_load_buddy and that
-        * would have taken the alloc_sem lock.
+        * would have pinned buddy page to page cache.
         */
-       num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
-       if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+       ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+       if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
                /*
                 * somebody initialized the group
                 * return without doing anything
                 */
-               ret = 0;
                goto err;
        }
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       poff = block % blocks_per_page;
-       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-       if (page) {
-               BUG_ON(page->mapping != inode->i_mapping);
-               ret = ext4_mb_init_cache(page, NULL);
-               if (ret) {
-                       unlock_page(page);
-                       goto err;
-               }
-               unlock_page(page);
-       }
-       if (page == NULL || !PageUptodate(page)) {
+
+       page = e4b.bd_bitmap_page;
+       ret = ext4_mb_init_cache(page, NULL);
+       if (ret)
+               goto err;
+       if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }
        mark_page_accessed(page);
-       bitmap_page = page;
-       bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-       /* init buddy cache */
-       block++;
-       pnum = block / blocks_per_page;
-       poff = block % blocks_per_page;
-       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-       if (page == bitmap_page) {
+       if (e4b.bd_buddy_page == NULL) {
                /*
                 * If both the bitmap and buddy are in
                 * the same page we don't need to force
                 * init the buddy
                 */
-               unlock_page(page);
-       } else if (page) {
-               BUG_ON(page->mapping != inode->i_mapping);
-               ret = ext4_mb_init_cache(page, bitmap);
-               if (ret) {
-                       unlock_page(page);
-                       goto err;
-               }
-               unlock_page(page);
+               ret = 0;
+               goto err;
        }
-       if (page == NULL || !PageUptodate(page)) {
+       /* init buddy cache */
+       page = e4b.bd_buddy_page;
+       ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+       if (ret)
+               goto err;
+       if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }
        mark_page_accessed(page);
 err:
-       ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-       if (bitmap_page)
-               page_cache_release(bitmap_page);
-       if (page)
-               page_cache_release(page);
+       ext4_mb_put_buddy_page_lock(&e4b);
        return ret;
 }
 
@@ -1273,6 +1222,8 @@ repeat_load_buddy:
        return 0;
 
 err:
+       if (page)
+               page_cache_release(page);
        if (e4b->bd_bitmap_page)
                page_cache_release(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
@@ -2659,7 +2610,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct super_block *sb = journal->j_private;
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
-       int err, ret, count = 0, count2 = 0;
+       int err, count = 0, count2 = 0;
        struct ext4_free_data *entry;
        struct list_head *l, *ltmp;
 
@@ -2669,15 +2620,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
 
-               if (test_opt(sb, DISCARD)) {
-                       ret = ext4_issue_discard(sb, entry->group,
-                                       entry->start_blk, entry->count);
-                       if (unlikely(ret == -EOPNOTSUPP)) {
-                               ext4_warning(sb, "discard not supported, "
-                                                "disabling");
-                               clear_opt(sb, DISCARD);
-                       }
-               }
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, entry->group,
+                                          entry->start_blk, entry->count);
 
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -4703,6 +4648,127 @@ error_return:
        return;
 }
 
+/**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle:                    handle to this transaction
+ * @sb:                                super block
+ * @block:                     start physical block to add to the block group
+ * @count:                     number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                        ext4_fsblk_t block, unsigned long count)
+{
+       struct buffer_head *bitmap_bh = NULL;
+       struct buffer_head *gd_bh;
+       ext4_group_t block_group;
+       ext4_grpblk_t bit;
+       unsigned int i;
+       struct ext4_group_desc *desc;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_buddy e4b;
+       int err = 0, ret, blk_free_count;
+       ext4_grpblk_t blocks_freed;
+       struct ext4_group_info *grp;
+
+       ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+       ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+       grp = ext4_get_group_info(sb, block_group);
+       /*
+        * Check to see if we are freeing blocks across a group
+        * boundary.
+        */
+       if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+               goto error_return;
+
+       bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+       if (!bitmap_bh)
+               goto error_return;
+       desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+       if (!desc)
+               goto error_return;
+
+       if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+           in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+           in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+           in_range(block + count - 1, ext4_inode_table(sb, desc),
+                    sbi->s_itb_per_group)) {
+               ext4_error(sb, "Adding blocks in system zones - "
+                          "Block = %llu, count = %lu",
+                          block, count);
+               goto error_return;
+       }
+
+       BUFFER_TRACE(bitmap_bh, "getting write access");
+       err = ext4_journal_get_write_access(handle, bitmap_bh);
+       if (err)
+               goto error_return;
+
+       /*
+        * We are about to modify some metadata.  Call the journal APIs
+        * to unshare ->b_data if a currently-committing transaction is
+        * using it
+        */
+       BUFFER_TRACE(gd_bh, "get_write_access");
+       err = ext4_journal_get_write_access(handle, gd_bh);
+       if (err)
+               goto error_return;
+
+       for (i = 0, blocks_freed = 0; i < count; i++) {
+               BUFFER_TRACE(bitmap_bh, "clear bit");
+               if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+                       ext4_error(sb, "bit already cleared for block %llu",
+                                  (ext4_fsblk_t)(block + i));
+                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
+               } else {
+                       blocks_freed++;
+               }
+       }
+
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+               goto error_return;
+
+       /*
+        * need to update group_info->bb_free and bitmap
+        * with group lock held. generate_buddy looks at
+        * them with the group lock held
+        */
+       ext4_lock_group(sb, block_group);
+       mb_clear_bits(bitmap_bh->b_data, bit, count);
+       mb_free_blocks(NULL, &e4b, bit, count);
+       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+       ext4_free_blks_set(sb, desc, blk_free_count);
+       desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+       ext4_unlock_group(sb, block_group);
+       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+               atomic_add(blocks_freed,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
+       }
+
+       ext4_mb_unload_buddy(&e4b);
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+       if (!err)
+               err = ret;
+
+error_return:
+       brelse(bitmap_bh);
+       ext4_std_error(sb, err);
+       return;
+}
+
 /**
  * ext4_trim_extent -- function to TRIM one single free extent in the group
  * @sb:                super block for the file system
@@ -4715,11 +4781,10 @@ error_return:
  * one will allocate those blocks, mark it as used in buddy bitmap. This must
  * be called with under the group lock.
  */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
-               ext4_group_t group, struct ext4_buddy *e4b)
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+                            ext4_group_t group, struct ext4_buddy *e4b)
 {
        struct ext4_free_extent ex;
-       int ret = 0;
 
        assert_spin_locked(ext4_group_lock_ptr(sb, group));
 
@@ -4733,12 +4798,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
         */
        mb_mark_used(e4b, &ex);
        ext4_unlock_group(sb, group);
-
-       ret = ext4_issue_discard(sb, group, start, count);
-
+       ext4_issue_discard(sb, group, start, count);
        ext4_lock_group(sb, group);
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
-       return ret;
 }
 
 /**
@@ -4766,7 +4828,6 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
        void *bitmap;
        ext4_grpblk_t next, count = 0;
        ext4_group_t group;
-       int ret = 0;
 
        BUG_ON(e4b == NULL);
 
@@ -4783,10 +4844,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
                next = mb_find_next_bit(bitmap, max, start);
 
                if ((next - start) >= minblocks) {
-                       ret = ext4_trim_extent(sb, start,
-                               next - start, group, e4b);
-                       if (ret < 0)
-                               break;
+                       ext4_trim_extent(sb, start,
+                                        next - start, group, e4b);
                        count += next - start;
                }
                start = next + 1;
@@ -4810,9 +4869,6 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
        ext4_debug("trimmed %d blocks in the group %d\n",
                count, group);
 
-       if (ret < 0)
-               count = ret;
-
        return count;
 }