Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Nov 2011 17:06:20 +0000 (10:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Nov 2011 17:06:20 +0000 (10:06 -0700)
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (97 commits)
  jbd2: Unify log messages in jbd2 code
  jbd/jbd2: validate sb->s_first in journal_get_superblock()
  ext4: let ext4_ext_rm_leaf work with EXT_DEBUG defined
  ext4: fix a syntax error in ext4_ext_insert_extent when debugging enabled
  ext4: fix a typo in struct ext4_allocation_context
  ext4: Don't normalize an falloc request if it can fit in 1 extent.
  ext4: remove comments about extent mount option in ext4_new_inode()
  ext4: let ext4_discard_partial_buffers handle unaligned range correctly
  ext4: return ENOMEM if find_or_create_pages fails
  ext4: move vars to local scope in ext4_discard_partial_page_buffers_no_lock()
  ext4: Create helper function for EXT4_IO_END_UNWRITTEN and i_aiodio_unwritten
  ext4: optimize locking for end_io extent conversion
  ext4: remove unnecessary call to waitqueue_active()
  ext4: Use correct locking for ext4_end_io_nolock()
  ext4: fix race in xattr block allocation path
  ext4: trace punch_hole correctly in ext4_ext_map_blocks
  ext4: clean up AGGRESSIVE_TEST code
  ext4: move variables to their scope
  ext4: fix quota accounting during migration
  ext4: migrate cleanup
  ...

34 files changed:
Documentation/filesystems/ext4.txt
fs/ext4/balloc.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_jbd2.c
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/migrate.c
fs/ext4/mmp.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/jbd/journal.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/recovery.c
fs/jbd2/transaction.c
include/linux/ext2_fs.h
include/linux/ext3_fs.h
include/linux/fs.h
include/linux/jbd.h
include/linux/jbd2.h
include/linux/jbd_common.h [new file with mode: 0644]
include/trace/events/ext4.h

index 232a575..4917cf2 100644
@@ -160,7 +160,9 @@ noload                      if the filesystem was not unmounted cleanly,
                        lead to any number of problems.
 
 data=journal           All data are committed into the journal prior to being
-                       written into the main file system.
+                       written into the main file system.  Enabling
+                       this mode will disable delayed allocation and
+                       O_DIRECT support.
 
 data=ordered   (*)     All data are forced directly out to the main file
                        system prior to its metadata being committed to the
@@ -201,30 +203,19 @@ inode_readahead_blks=n    This tuning parameter controls the maximum
                        table readahead algorithm will pre-read into
                        the buffer cache.  The default value is 32 blocks.
 
-orlov          (*)     This enables the new Orlov block allocator. It is
-                       enabled by default.
-
-oldalloc               This disables the Orlov block allocator and enables
-                       the old block allocator.  Orlov should have better
-                       performance - we'd like to get some feedback if it's
-                       the contrary for you.
-
-user_xattr             Enables Extended User Attributes.  Additionally, you
-                       need to have extended attribute support enabled in the
-                       kernel configuration (CONFIG_EXT4_FS_XATTR).  See the
-                       attr(5) manual page and http://acl.bestbits.at/ to
-                       learn more about extended attributes.
-
-nouser_xattr           Disables Extended User Attributes.
-
-acl                    Enables POSIX Access Control Lists support.
-                       Additionally, you need to have ACL support enabled in
-                       the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL).
-                       See the acl(5) manual page and http://acl.bestbits.at/
-                       for more information.
+nouser_xattr           Disables Extended User Attributes. If you have extended
+                       attribute support enabled in the kernel configuration
+                       (CONFIG_EXT4_FS_XATTR), extended attribute support
+                       is enabled by default on mount. See the attr(5) manual
+                       page and http://acl.bestbits.at/ for more information
+                       about extended attributes.
 
 noacl                  This option disables POSIX Access Control List
-                       support.
+                       support. If ACL support is enabled in the kernel
+                       configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is
+                       enabled by default on mount. See the acl(5) manual
+                       page and http://acl.bestbits.at/ for more information
+                       about acl.
 
 bsddf          (*)     Make 'df' act like BSD.
 minixdf                        Make 'df' act like Minix.
@@ -419,8 +410,8 @@ written to the journal first, and then to its final location.
 In the event of a crash, the journal can be replayed, bringing both data and
 metadata into a consistent state.  This mode is the slowest except when data
 needs to be read from and written to disk at the same time where it
-outperforms all others modes.  Currently ext4 does not have delayed
-allocation support if this data journalling mode is selected.
+outperforms all others modes.  Enabling this mode will disable delayed
+allocation and O_DIRECT support.
 
 /proc entries
 =============
index f8224ad..f6dba45 100644
@@ -28,7 +28,8 @@
  */
 
 /*
- * Calculate the block group number and offset, given a block number
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
  */
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
@@ -37,7 +38,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
        ext4_grpblk_t offset;
 
        blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
-       offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+       offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+               EXT4_SB(sb)->s_cluster_bits;
        if (offsetp)
                *offsetp = offset;
        if (blockgrpp)
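
The change above makes the within-group offset index the cluster allocation bitmap: do_div() leaves the quotient (the group number) in blocknr and returns the remainder (the block offset inside the group), which is then shifted right by s_cluster_bits. A standalone C sketch of the same arithmetic, using illustrative constants that are not taken from the patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Illustrative values only: 32768 blocks per group, 16-block clusters. */
	uint64_t blocknr = 100000;
	uint64_t first_data_block = 0;
	uint64_t blocks_per_group = 32768;
	unsigned int cluster_bits = 4;

	blocknr -= first_data_block;
	uint64_t group  = blocknr / blocks_per_group;      /* do_div(): quotient  */
	uint64_t offset = blocknr % blocks_per_group;      /* do_div(): remainder */
	uint64_t cluster_offset = offset >> cluster_bits;  /* bitmap is per cluster */

	printf("group %llu, block offset %llu, cluster offset %llu\n",
	       (unsigned long long)group,
	       (unsigned long long)offset,
	       (unsigned long long)cluster_offset);
	return 0;
}
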
@@ -55,130 +57,169 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
        return 0;
 }
 
-static int ext4_group_used_meta_blocks(struct super_block *sb,
-                                      ext4_group_t block_group,
-                                      struct ext4_group_desc *gdp)
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ */
+unsigned ext4_num_overhead_clusters(struct super_block *sb,
+                                   ext4_group_t block_group,
+                                   struct ext4_group_desc *gdp)
 {
-       ext4_fsblk_t tmp;
+       unsigned num_clusters;
+       int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+       ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+       ext4_fsblk_t itbl_blk;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       /* block bitmap, inode bitmap, and inode table blocks */
-       int used_blocks = sbi->s_itb_per_group + 2;
 
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-               if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
-                                       block_group))
-                       used_blocks--;
-
-               if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
-                                       block_group))
-                       used_blocks--;
-
-               tmp = ext4_inode_table(sb, gdp);
-               for (; tmp < ext4_inode_table(sb, gdp) +
-                               sbi->s_itb_per_group; tmp++) {
-                       if (!ext4_block_in_group(sb, tmp, block_group))
-                               used_blocks -= 1;
+       /* This is the number of clusters used by the superblock,
+        * block group descriptors, and reserved block group
+        * descriptor blocks */
+       num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+       /*
+        * For the allocation bitmaps and inode table, we first need
+        * to check to see if the block is in the block group.  If it
+        * is, then check to see if the cluster is already accounted
+        * for in the clusters used for the base metadata cluster, or
+        * if we can increment the base metadata cluster to include
+        * that block.  Otherwise, we will have to track the cluster
+        * used for the allocation bitmap or inode table explicitly.
+        * Normally all of these blocks are contiguous, so the special
+        * case handling shouldn't be necessary except for *very*
+        * unusual file system layouts.
+        */
+       if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+               block_cluster = EXT4_B2C(sbi, (start -
+                                              ext4_block_bitmap(sb, gdp)));
+               if (block_cluster < num_clusters)
+                       block_cluster = -1;
+               else if (block_cluster == num_clusters) {
+                       num_clusters++;
+                       block_cluster = -1;
                }
        }
-       return used_blocks;
-}
 
-/* Initializes an uninitialized block bitmap if given, and returns the
- * number of blocks free in the group. */
-unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
-                ext4_group_t block_group, struct ext4_group_desc *gdp)
-{
-       int bit, bit_max;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       unsigned free_blocks, group_blocks;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-       if (bh) {
-               J_ASSERT_BH(bh, buffer_locked(bh));
-
-               /* If checksum is bad mark all blocks used to prevent allocation
-                * essentially implementing a per-group read-only flag. */
-               if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-                       ext4_error(sb, "Checksum bad for group %u",
-                                       block_group);
-                       ext4_free_blks_set(sb, gdp, 0);
-                       ext4_free_inodes_set(sb, gdp, 0);
-                       ext4_itable_unused_set(sb, gdp, 0);
-                       memset(bh->b_data, 0xff, sb->s_blocksize);
-                       return 0;
+       if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+               inode_cluster = EXT4_B2C(sbi,
+                                        start - ext4_inode_bitmap(sb, gdp));
+               if (inode_cluster < num_clusters)
+                       inode_cluster = -1;
+               else if (inode_cluster == num_clusters) {
+                       num_clusters++;
+                       inode_cluster = -1;
                }
-               memset(bh->b_data, 0, sb->s_blocksize);
        }
 
-       /* Check for superblock and gdt backups in this group */
-       bit_max = ext4_bg_has_super(sb, block_group);
-
-       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
-           block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
-                         sbi->s_desc_per_block) {
-               if (bit_max) {
-                       bit_max += ext4_bg_num_gdb(sb, block_group);
-                       bit_max +=
-                               le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+       itbl_blk = ext4_inode_table(sb, gdp);
+       for (i = 0; i < sbi->s_itb_per_group; i++) {
+               if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+                       c = EXT4_B2C(sbi, start - itbl_blk + i);
+                       if ((c < num_clusters) || (c == inode_cluster) ||
+                           (c == block_cluster) || (c == itbl_cluster))
+                               continue;
+                       if (c == num_clusters) {
+                               num_clusters++;
+                               continue;
+                       }
+                       num_clusters++;
+                       itbl_cluster = c;
                }
-       } else { /* For META_BG_BLOCK_GROUPS */
-               bit_max += ext4_bg_num_gdb(sb, block_group);
        }
 
-       if (block_group == ngroups - 1) {
+       if (block_cluster != -1)
+               num_clusters++;
+       if (inode_cluster != -1)
+               num_clusters++;
+
+       return num_clusters;
+}
+
+static unsigned int num_clusters_in_group(struct super_block *sb,
+                                         ext4_group_t block_group)
+{
+       unsigned int blocks;
+
+       if (block_group == ext4_get_groups_count(sb) - 1) {
                /*
-                * Even though mke2fs always initialize first and last group
-                * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
-                * to make sure we calculate the right free blocks
+                * Even though mke2fs always initializes the first and
+                * last group, just in case some other tool was used,
+                * we need to make sure we calculate the right free
+                * blocks.
                 */
-               group_blocks = ext4_blocks_count(sbi->s_es) -
-                       ext4_group_first_block_no(sb, ngroups - 1);
-       } else {
-               group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
-       }
+               blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+                       ext4_group_first_block_no(sb, block_group);
+       } else
+               blocks = EXT4_BLOCKS_PER_GROUP(sb);
+       return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
 
-       free_blocks = group_blocks - bit_max;
+/* Initializes an uninitialized block bitmap */
+void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
+                           ext4_group_t block_group,
+                           struct ext4_group_desc *gdp)
+{
+       unsigned int bit, bit_max;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       ext4_fsblk_t start, tmp;
+       int flex_bg = 0;
+
+       J_ASSERT_BH(bh, buffer_locked(bh));
+
+       /* If checksum is bad mark all blocks used to prevent allocation
+        * essentially implementing a per-group read-only flag. */
+       if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+               ext4_error(sb, "Checksum bad for group %u", block_group);
+               ext4_free_group_clusters_set(sb, gdp, 0);
+               ext4_free_inodes_set(sb, gdp, 0);
+               ext4_itable_unused_set(sb, gdp, 0);
+               memset(bh->b_data, 0xff, sb->s_blocksize);
+               return;
+       }
+       memset(bh->b_data, 0, sb->s_blocksize);
 
-       if (bh) {
-               ext4_fsblk_t start, tmp;
-               int flex_bg = 0;
+       bit_max = ext4_num_base_meta_clusters(sb, block_group);
+       for (bit = 0; bit < bit_max; bit++)
+               ext4_set_bit(bit, bh->b_data);
 
-               for (bit = 0; bit < bit_max; bit++)
-                       ext4_set_bit(bit, bh->b_data);
+       start = ext4_group_first_block_no(sb, block_group);
 
-               start = ext4_group_first_block_no(sb, block_group);
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+               flex_bg = 1;
 
-               if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-                                             EXT4_FEATURE_INCOMPAT_FLEX_BG))
-                       flex_bg = 1;
+       /* Set bits for block and inode bitmaps, and inode table */
+       tmp = ext4_block_bitmap(sb, gdp);
+       if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+               ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
 
-               /* Set bits for block and inode bitmaps, and inode table */
-               tmp = ext4_block_bitmap(sb, gdp);
-               if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
-                       ext4_set_bit(tmp - start, bh->b_data);
+       tmp = ext4_inode_bitmap(sb, gdp);
+       if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+               ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
 
-               tmp = ext4_inode_bitmap(sb, gdp);
+       tmp = ext4_inode_table(sb, gdp);
+       for (; tmp < ext4_inode_table(sb, gdp) +
+                    sbi->s_itb_per_group; tmp++) {
                if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
-                       ext4_set_bit(tmp - start, bh->b_data);
-
-               tmp = ext4_inode_table(sb, gdp);
-               for (; tmp < ext4_inode_table(sb, gdp) +
-                               sbi->s_itb_per_group; tmp++) {
-                       if (!flex_bg ||
-                               ext4_block_in_group(sb, tmp, block_group))
-                               ext4_set_bit(tmp - start, bh->b_data);
-               }
-               /*
-                * Also if the number of blocks within the group is
-                * less than the blocksize * 8 ( which is the size
-                * of bitmap ), set rest of the block bitmap to 1
-                */
-               ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
-                                    bh->b_data);
+                       ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
        }
-       return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
+
+       /*
+        * Also if the number of blocks within the group is less than
+        * the blocksize * 8 ( which is the size of bitmap ), set rest
+        * of the block bitmap to 1
+        */
+       ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+                            sb->s_blocksize * 8, bh->b_data);
 }
 
+/* Return the number of free blocks in a block group.  It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+                                      ext4_group_t block_group,
+                                      struct ext4_group_desc *gdp)
+{
+       return num_clusters_in_group(sb, block_group) - 
+               ext4_num_overhead_clusters(sb, block_group, gdp);
+}
 
 /*
  * The free blocks are managed by bitmaps.  A file system contains several
@@ -362,53 +403,54 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_has_free_blocks()
+ * ext4_has_free_clusters()
  * @sbi:       in-core super block structure.
- * @nblocks:   number of needed blocks
+ * @nclusters: number of needed blocks
+ * @flags:     flags from ext4_mb_new_blocks()
  *
- * Check if filesystem has nblocks free & available for allocation.
+ * Check if filesystem has nclusters free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
-                               s64 nblocks, unsigned int flags)
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+                                 s64 nclusters, unsigned int flags)
 {
-       s64 free_blocks, dirty_blocks, root_blocks;
-       struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
-       struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
-       free_blocks  = percpu_counter_read_positive(fbc);
-       dirty_blocks = percpu_counter_read_positive(dbc);
-       root_blocks = ext4_r_blocks_count(sbi->s_es);
-
-       if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
-                                               EXT4_FREEBLOCKS_WATERMARK) {
-               free_blocks  = percpu_counter_sum_positive(fbc);
-               dirty_blocks = percpu_counter_sum_positive(dbc);
+       s64 free_clusters, dirty_clusters, root_clusters;
+       struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+       struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
+
+       free_clusters  = percpu_counter_read_positive(fcc);
+       dirty_clusters = percpu_counter_read_positive(dcc);
+       root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+
+       if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+                                       EXT4_FREECLUSTERS_WATERMARK) {
+               free_clusters  = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
+               dirty_clusters = percpu_counter_sum_positive(dcc);
        }
-       /* Check whether we have space after
-        * accounting for current dirty blocks & root reserved blocks.
+       /* Check whether we have space after accounting for current
+        * dirty clusters & root reserved clusters.
         */
-       if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+       if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
                return 1;
 
-       /* Hm, nope.  Are (enough) root reserved blocks available? */
+       /* Hm, nope.  Are (enough) root reserved clusters available? */
        if (sbi->s_resuid == current_fsuid() ||
            ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
            capable(CAP_SYS_RESOURCE) ||
                (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
-               if (free_blocks >= (nblocks + dirty_blocks))
+               if (free_clusters >= (nclusters + dirty_clusters))
                        return 1;
        }
 
        return 0;
 }
 
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-                          s64 nblocks, unsigned int flags)
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+                            s64 nclusters, unsigned int flags)
 {
-       if (ext4_has_free_blocks(sbi, nblocks, flags)) {
-               percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
+       if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+               percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
                return 0;
        } else
                return -ENOSPC;
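
The structure of ext4_has_free_clusters() is worth spelling out: it first reads the per-cpu counters with the cheap, approximate percpu_counter_read_positive(), and only when that approximation comes within EXT4_FREECLUSTERS_WATERMARK of denying the request does it pay for an exact percpu_counter_sum_positive(). A standalone C sketch of this two-phase check, using stand-in counter functions rather than the kernel's percpu_counter API:

#include <stdio.h>

/* Stand-ins for percpu_counter_read_positive() / percpu_counter_sum_positive(). */
static long approx_free_clusters(void) { return 1000; }  /* cheap, may be stale */
static long exact_free_clusters(void)  { return  950; }  /* expensive, accurate */

static int has_free_clusters(long nclusters, long root_clusters,
			     long dirty_clusters, long watermark)
{
	long free_clusters = approx_free_clusters();

	if (free_clusters - (nclusters + root_clusters + dirty_clusters) < watermark)
		free_clusters = exact_free_clusters();  /* close call: recount exactly */

	return free_clusters >= nclusters + root_clusters + dirty_clusters;
}

int main(void)
{
	printf("allocation %s\n",
	       has_free_clusters(100, 50, 10, 64) ? "allowed" : "denied");
	return 0;
}
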
@@ -428,7 +470,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-       if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
+       if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
            (*retries)++ > 3 ||
            !EXT4_SB(sb)->s_journal)
                return 0;
@@ -444,7 +486,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * @handle:             handle to this transaction
  * @inode:              file inode
  * @goal:               given target block(filesystem wide)
- * @count:             pointer to total number of blocks needed
+ * @count:             pointer to total number of clusters needed
  * @errp:               error code
  *
  * Return 1st allocated block number on success, *count stores total account
@@ -476,18 +518,19 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
                EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-               dquot_alloc_block_nofail(inode, ar.len);
+               dquot_alloc_block_nofail(inode,
+                               EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
        }
        return ret;
 }
 
 /**
- * ext4_count_free_blocks() -- count filesystem free blocks
+ * ext4_count_free_clusters() -- count filesystem free clusters
  * @sb:                superblock
  *
- * Adds up the number of free blocks from each block group.
+ * Adds up the number of free clusters from each block group.
  */
-ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 {
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
@@ -508,7 +551,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += ext4_free_blks_count(sb, gdp);
+               desc_count += ext4_free_group_clusters(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (bitmap_bh == NULL)
@@ -516,12 +559,13 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 
                x = ext4_count_free(bitmap_bh, sb->s_blocksize);
                printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
-                       i, ext4_free_blks_count(sb, gdp), x);
+                       i, ext4_free_group_clusters(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
-       printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
-               ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+       printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+              ", computed = %llu, %llu\n",
+              EXT4_B2C(sbi, ext4_free_blocks_count(es)),
               desc_count, bitmap_count);
        return bitmap_count;
 #else
@@ -530,7 +574,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += ext4_free_blks_count(sb, gdp);
+               desc_count += ext4_free_group_clusters(sb, gdp);
        }
 
        return desc_count;
@@ -620,6 +664,31 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 
 }
 
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+                                    ext4_group_t block_group)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned num;
+
+       /* Check for superblock and gdt backups in this group */
+       num = ext4_bg_has_super(sb, block_group);
+
+       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+           block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+                         sbi->s_desc_per_block) {
+               if (num) {
+                       num += ext4_bg_num_gdb(sb, block_group);
+                       num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+               }
+       } else { /* For META_BG_BLOCK_GROUPS */
+               num += ext4_bg_num_gdb(sb, block_group);
+       }
+       return EXT4_NUM_B2C(sbi, num);
+}
 /**
  *     ext4_inode_to_goal_block - return a hint for block allocation
  *     @inode: inode for block allocation
index cec3145..5b0e26a 100644
@@ -144,9 +144,17 @@ struct ext4_allocation_request {
 #define EXT4_MAP_UNWRITTEN     (1 << BH_Unwritten)
 #define EXT4_MAP_BOUNDARY      (1 << BH_Boundary)
 #define EXT4_MAP_UNINIT                (1 << BH_Uninit)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER  (1 << BH_AllocFromCluster)
 #define EXT4_MAP_FLAGS         (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
-                                EXT4_MAP_UNINIT)
+                                EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
 
 struct ext4_map_blocks {
        ext4_fsblk_t m_pblk;
@@ -239,8 +247,11 @@ struct ext4_io_submit {
 # define EXT4_BLOCK_SIZE(s)            (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
 #endif
 #define        EXT4_ADDR_PER_BLOCK(s)          (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s)           (EXT4_BLOCK_SIZE(s) << \
+                                        EXT4_SB(s)->s_cluster_bits)
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE_BITS(s)       ((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s)          (EXT4_SB(s)->s_cluster_bits)
 #else
 # define EXT4_BLOCK_SIZE_BITS(s)       ((s)->s_log_block_size + 10)
 #endif
@@ -258,6 +269,14 @@ struct ext4_io_submit {
 #endif
 #define EXT4_BLOCK_ALIGN(size, blkbits)                ALIGN((size), (1 << (blkbits)))
 
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk)     ((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks)        (((blks) + (sbi)->s_cluster_ratio - 1) >> \
+                                (sbi)->s_cluster_bits)
+
 /*
  * Structure of a blocks group descriptor
  */
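
The three macros above are the heart of bigalloc bookkeeping: a block maps to its containing cluster by shifting right by s_cluster_bits, a cluster maps back to its first block by shifting left, and EXT4_NUM_B2C rounds up so that a partially occupied cluster still counts as a whole one. A standalone C sketch assuming a 16-blocks-per-cluster ratio (an illustrative value, not one taken from the patch):

#include <stdio.h>

/* Assumed cluster geometry: 16 blocks per cluster (cluster_bits = 4). */
#define CLUSTER_BITS   4
#define CLUSTER_RATIO  (1 << CLUSTER_BITS)

#define B2C(blk)       ((blk) >> CLUSTER_BITS)                        /* block -> cluster */
#define C2B(cluster)   ((cluster) << CLUSTER_BITS)                    /* cluster -> block */
#define NUM_B2C(blks)  (((blks) + CLUSTER_RATIO - 1) >> CLUSTER_BITS) /* rounded up */

int main(void)
{
	printf("block 100 lies in cluster %d\n", B2C(100));     /* 6  */
	printf("cluster 6 starts at block %d\n", C2B(6));       /* 96 */
	printf("33 blocks occupy %d clusters\n", NUM_B2C(33));  /* 3  */
	return 0;
}
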
@@ -289,7 +308,7 @@ struct ext4_group_desc
 
 struct flex_groups {
        atomic_t free_inodes;
-       atomic_t free_blocks;
+       atomic_t free_clusters;
        atomic_t used_dirs;
 };
 
@@ -306,6 +325,7 @@ struct flex_groups {
 #define EXT4_DESC_SIZE(s)              (EXT4_SB(s)->s_desc_size)
 #ifdef __KERNEL__
 # define EXT4_BLOCKS_PER_GROUP(s)      (EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s)    (EXT4_SB(s)->s_clusters_per_group)
 # define EXT4_DESC_PER_BLOCK(s)                (EXT4_SB(s)->s_desc_per_block)
 # define EXT4_INODES_PER_GROUP(s)      (EXT4_SB(s)->s_inodes_per_group)
 # define EXT4_DESC_PER_BLOCK_BITS(s)   (EXT4_SB(s)->s_desc_per_block_bits)
@@ -358,8 +378,7 @@ struct flex_groups {
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
-                          EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
-                          EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+                          EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
 
@@ -520,6 +539,8 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT          0x0020
        /* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE           0x0040
+       /* Request will not result in inode size update (user for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE              0x0080
 
 /*
  * Flags used by ext4_free_blocks
@@ -528,6 +549,13 @@ struct ext4_new_group_data {
 #define EXT4_FREE_BLOCKS_FORGET                0x0002
 #define EXT4_FREE_BLOCKS_VALIDATED     0x0004
 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE        0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER  0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER   0x0020
+
+/*
+ * Flags used by ext4_discard_partial_page_buffers
+ */
+#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED  0x0001
 
 /*
  * ioctl commands
@@ -538,9 +566,6 @@ struct ext4_new_group_data {
 #define        EXT4_IOC_SETVERSION             _IOW('f', 4, long)
 #define        EXT4_IOC_GETVERSION_OLD         FS_IOC_GETVERSION
 #define        EXT4_IOC_SETVERSION_OLD         FS_IOC_SETVERSION
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC_WAIT_FOR_READONLY     _IOR('f', 99, long)
-#endif
 #define EXT4_IOC_GETRSVSZ              _IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ              _IOW('f', 6, long)
 #define EXT4_IOC_GROUP_EXTEND          _IOW('f', 7, unsigned long)
@@ -563,9 +588,6 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_SETRSVSZ            _IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND                _IOW('f', 7, unsigned int)
 #define EXT4_IOC32_GROUP_ADD           _IOW('f', 8, struct compat_ext4_new_group_input)
-#ifdef CONFIG_JBD2_DEBUG
-#define EXT4_IOC32_WAIT_FOR_READONLY   _IOR('f', 99, int)
-#endif
 #define EXT4_IOC32_GETVERSION_OLD      FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD      FS_IOC32_SETVERSION
 #endif
@@ -837,6 +859,7 @@ struct ext4_inode_info {
        ext4_group_t    i_last_alloc_group;
 
        /* allocation reservation info for delalloc */
+       /* In case of bigalloc, these refer to clusters rather than blocks */
        unsigned int i_reserved_data_blocks;
        unsigned int i_reserved_meta_blocks;
        unsigned int i_allocated_meta_blocks;
@@ -886,7 +909,6 @@ struct ext4_inode_info {
 /*
  * Mount flags
  */
-#define EXT4_MOUNT_OLDALLOC            0x00002  /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_GRPID               0x00004 /* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG               0x00008 /* Some debugging messages */
 #define EXT4_MOUNT_ERRORS_CONT         0x00010 /* Continue on errors */
@@ -918,6 +940,9 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DISCARD             0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE    0x80000000 /* Initialize uninitialized itables */
 
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC  0x00000001 /* User explicitly
+                                                     specified delalloc */
+
 #define clear_opt(sb, opt)             EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt
 #define set_opt(sb, opt)               EXT4_SB(sb)->s_mount_opt |= \
@@ -968,9 +993,9 @@ struct ext4_super_block {
 /*10*/ __le32  s_free_inodes_count;    /* Free inodes count */
        __le32  s_first_data_block;     /* First Data Block */
        __le32  s_log_block_size;       /* Block size */
-       __le32  s_obso_log_frag_size;   /* Obsoleted fragment size */
+       __le32  s_log_cluster_size;     /* Allocation cluster size */
 /*20*/ __le32  s_blocks_per_group;     /* # Blocks per group */
-       __le32  s_obso_frags_per_group; /* Obsoleted fragments per group */
+       __le32  s_clusters_per_group;   /* # Clusters per group */
        __le32  s_inodes_per_group;     /* # Inodes per group */
        __le32  s_mtime;                /* Mount time */
 /*30*/ __le32  s_wtime;                /* Write time */
@@ -1066,7 +1091,10 @@ struct ext4_super_block {
        __u8    s_last_error_func[32];  /* function where the error happened */
 #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8    s_mount_opts[64];
-       __le32  s_reserved[112];        /* Padding to the end of the block */
+       __le32  s_usr_quota_inum;       /* inode for tracking user quota */
+       __le32  s_grp_quota_inum;       /* inode for tracking group quota */
+       __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
+       __le32  s_reserved[109];        /* Padding to the end of the block */
 };
 
 #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1086,6 +1114,7 @@ struct ext4_sb_info {
        unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
        unsigned long s_inodes_per_block;/* Number of inodes per block */
        unsigned long s_blocks_per_group;/* Number of blocks in a group */
+       unsigned long s_clusters_per_group; /* Number of clusters in a group */
        unsigned long s_inodes_per_group;/* Number of inodes in a group */
        unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
        unsigned long s_gdb_count;      /* Number of group descriptor blocks */
@@ -1094,6 +1123,8 @@ struct ext4_sb_info {
        ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
        unsigned long s_overhead_last;  /* Last calculated overhead */
        unsigned long s_blocks_last;    /* Last seen block count */
+       unsigned int s_cluster_ratio;   /* Number of blocks per cluster */
+       unsigned int s_cluster_bits;    /* log2 of s_cluster_ratio */
        loff_t s_bitmap_maxbytes;       /* max bytes for bitmap files */
        struct buffer_head * s_sbh;     /* Buffer containing the super block */
        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
@@ -1117,10 +1148,10 @@ struct ext4_sb_info {
        u32 s_hash_seed[4];
        int s_def_hash_version;
        int s_hash_unsigned;    /* 3 if hash should be signed, 0 if not */
-       struct percpu_counter s_freeblocks_counter;
+       struct percpu_counter s_freeclusters_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
-       struct percpu_counter s_dirtyblocks_counter;
+       struct percpu_counter s_dirtyclusters_counter;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
@@ -1136,10 +1167,6 @@ struct ext4_sb_info {
        u32 s_max_batch_time;
        u32 s_min_batch_time;
        struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
-       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
-       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
-#endif
 #ifdef CONFIG_QUOTA
        char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
        int s_jquota_fmt;                       /* Format of quota to use */
@@ -1248,6 +1275,15 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
                 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+                                             struct ext4_io_end *io_end)
+{
+       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+               io_end->flag |= EXT4_IO_END_UNWRITTEN;
+               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+       }
+}
+
 /*
  * Inode dynamic state flags
  */
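
The new ext4_set_io_unwritten_flag() centralizes a test-and-set that callers previously open-coded: EXT4_IO_END_UNWRITTEN is set at most once per io_end, and i_aiodio_unwritten is incremented only on that first transition, keeping the count of pending unwritten-extent conversions balanced. A standalone C sketch of the same pattern, with illustrative names and a plain int standing in for the atomic counter:

#include <stdio.h>

#define IO_END_UNWRITTEN 0x1

struct io_end {
	unsigned int flag;
};

/* Stands in for the inode's atomic i_aiodio_unwritten counter. */
static int pending_unwritten;

static void set_io_unwritten_flag(struct io_end *io)
{
	if (!(io->flag & IO_END_UNWRITTEN)) {
		io->flag |= IO_END_UNWRITTEN;
		pending_unwritten++;    /* incremented exactly once per io_end */
	}
}

int main(void)
{
	struct io_end io = { 0 };

	set_io_unwritten_flag(&io);
	set_io_unwritten_flag(&io);     /* second call is a no-op */
	printf("pending unwritten conversions: %d\n", pending_unwritten);
	return 0;
}
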
@@ -1360,6 +1396,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK       0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE     0x0040
 #define EXT4_FEATURE_RO_COMPAT_QUOTA           0x0100
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION      0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE         0x0002
@@ -1402,7 +1439,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
                                         EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
-                                        EXT4_FEATURE_RO_COMPAT_HUGE_FILE)
+                                        EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+                                        EXT4_FEATURE_RO_COMPAT_BIGALLOC)
 
 /*
  * Default values for user and/or group using reserved blocks
@@ -1735,9 +1773,9 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                         unsigned int flags,
                                         unsigned long *count,
                                         int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-                                 s64 nblocks, unsigned int flags);
-extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+                                   s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
@@ -1745,12 +1783,18 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                      ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-                                      struct buffer_head *bh,
-                                      ext4_group_t group,
-                                      struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc)                   \
-               ext4_init_block_bitmap(sb, NULL, group, desc)
+extern void ext4_init_block_bitmap(struct super_block *sb,
+                                  struct buffer_head *bh,
+                                  ext4_group_t group,
+                                  struct ext4_group_desc *desc);
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+                                             ext4_group_t block_group,
+                                             struct ext4_group_desc *gdp);
+extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+                                           ext4_group_t block_group);
+extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
+                                          ext4_group_t block_group,
+                                          struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
 /* dir.c */
@@ -1776,7 +1820,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
 
 /* ialloc.c */
 extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
-                                   const struct qstr *qstr, __u32 goal);
+                                   const struct qstr *qstr, __u32 goal,
+                                   uid_t *owner);
 extern void ext4_free_inode(handle_t *, struct inode *);
 extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1839,6 +1884,12 @@ extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_discard_partial_page_buffers(handle_t *handle,
+               struct address_space *mapping, loff_t from,
+               loff_t length, int flags);
+extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+               struct inode *inode, struct page *page, loff_t from,
+               loff_t length, int flags);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1927,8 +1978,8 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
 extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
-extern __u32 ext4_free_blks_count(struct super_block *sb,
-                               struct ext4_group_desc *bg);
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
+                                     struct ext4_group_desc *bg);
 extern __u32 ext4_free_inodes_count(struct super_block *sb,
                                 struct ext4_group_desc *bg);
 extern __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -1941,8 +1992,9 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
-extern void ext4_free_blks_set(struct super_block *sb,
-                              struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+                                        struct ext4_group_desc *bg,
+                                        __u32 count);
 extern void ext4_free_inodes_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
 extern void ext4_used_dirs_set(struct super_block *sb,
@@ -2051,13 +2103,13 @@ do {                                                            \
 } while (0)
 
 #ifdef CONFIG_SMP
-/* Each CPU can accumulate percpu_counter_batch blocks in their local
- * counters. So we need to make sure we have free blocks more
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
  * than percpu_counter_batch  * nr_cpu_ids. Also add a window of 4 times.
  */
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
 #else
-#define EXT4_FREEBLOCKS_WATERMARK 0
+#define EXT4_FREECLUSTERS_WATERMARK 0
 #endif
 
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
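
As a worked example of the watermark above: if percpu_counter_batch were 32 on a machine with 8 CPUs (both values purely illustrative), each CPU could hold up to 32 clusters in its local counter, so the approximate global count could be off by 256 clusters; the factor of four widens that to a 1024-cluster margin before an exact sum is forced. A standalone C sketch of the computation:

#include <stdio.h>

int main(void)
{
	/* Illustrative values; the real ones depend on the running system. */
	int percpu_counter_batch = 32;
	int nr_cpu_ids = 8;
	long watermark = 4L * (percpu_counter_batch * nr_cpu_ids);

	printf("EXT4_FREECLUSTERS_WATERMARK would be %ld clusters\n", watermark);
	return 0;
}
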
@@ -2243,10 +2295,19 @@ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 enum ext4_state_bits {
        BH_Uninit       /* blocks are allocated but uninitialized on disk */
          = BH_JBDPrivateStart,
+       BH_AllocFromCluster,    /* allocated blocks were part of already
+                                * allocated cluster. Note that this flag will
+                                * never, ever appear in a buffer_head's state
+                                * flag. See EXT4_MAP_FROM_CLUSTER to see where
+                                * this is used. */
+       BH_Da_Mapped,   /* Delayed allocated block that now has a mapping. This
+                        * flag is set when ext4_map_blocks is called on a
+                        * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
+BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
  * Add new method to test wether block and inode bitmaps are properly
@@ -2282,4 +2343,6 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif /* __KERNEL__ */
 
+#include "ext4_extents.h"
+
 #endif /* _EXT4_H */
index 095c36f..a52db3a 100644
@@ -290,5 +290,7 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
                                                        struct ext4_ext_path *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+                                     int search_hint_reverse);
 #endif /* _EXT4_EXTENTS */
 
index f5240aa..aca1790 100644
@@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
-               if (err)
-                       ext4_journal_abort_handle(where, line, __func__,
-                                                 bh, handle, err);
+               if (err) {
+                       /* Errors can only happen if there is a bug */
+                       handle->h_err = err;
+                       __ext4_journal_stop(where, line, handle);
+               }
        } else {
                if (inode)
                        mark_buffer_dirty_inode(bh, inode);
index 57cf568..61fa9e1 100644
@@ -42,7 +42,6 @@
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 
 #include <trace/events/ext4.h>
 
@@ -96,13 +95,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
-                               struct ext4_ext_path *path)
+#define ext4_ext_dirty(handle, inode, path) \
+               __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+                           handle_t *handle, struct inode *inode,
+                           struct ext4_ext_path *path)
 {
        int err;
        if (path->p_bh) {
                /* path points to block */
-               err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
+               err = __ext4_handle_dirty_metadata(where, line, handle,
+                                                  inode, path->p_bh);
        } else {
                /* path points to leaf/index in inode body */
                err = ext4_mark_inode_dirty(handle, inode);
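
The rewrite above turns ext4_ext_dirty() into a macro so that __func__ and __LINE__ identify the real call site when dirtying the metadata fails. A standalone C sketch of that call-site-capture idiom, with illustrative function names:

#include <stdio.h>

#define mark_dirty(obj) __mark_dirty(__func__, __LINE__, (obj))

static int __mark_dirty(const char *where, unsigned int line, int *obj)
{
	if (obj == NULL) {
		fprintf(stderr, "mark_dirty failed at %s:%u\n", where, line);
		return -1;
	}
	(*obj)++;
	return 0;
}

int main(void)
{
	int dirty_count = 0;

	mark_dirty(&dirty_count);   /* succeeds quietly */
	mark_dirty(NULL);           /* reports main()'s line, not __mark_dirty's */
	return 0;
}
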
@@ -114,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                              struct ext4_ext_path *path,
                              ext4_lblk_t block)
 {
-       int depth;
-
        if (path) {
+               int depth = path->p_depth;
                struct ext4_extent *ex;
-               depth = path->p_depth;
 
                /*
                 * Try to predict block placement assuming that we are
@@ -180,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
 
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 6)
-                       size = 6;
+       if (!check && size > 6)
+               size = 6;
 #endif
-       }
        return size;
 }
 
@@ -195,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent_idx);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 5)
-                       size = 5;
+       if (!check && size > 5)
+               size = 5;
 #endif
-       }
        return size;
 }
 
@@ -211,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 3)
-                       size = 3;
+       if (!check && size > 3)
+               size = 3;
 #endif
-       }
        return size;
 }
 
@@ -227,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent_idx);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 4)
-                       size = 4;
+       if (!check && size > 4)
+               size = 4;
 #endif
-       }
        return size;
 }
 
@@ -244,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
-       int idxs, num = 0;
+       int idxs;
 
        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                / sizeof(struct ext4_extent_idx));
@@ -259,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
         */
        if (ei->i_da_metadata_calc_len &&
            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+               int num = 0;
+
                if ((ei->i_da_metadata_calc_len % idxs) == 0)
                        num++;
                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -321,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
                                struct ext4_extent_header *eh,
                                int depth)
 {
-       struct ext4_extent *ext;
-       struct ext4_extent_idx *ext_idx;
        unsigned short entries;
        if (eh->eh_entries == 0)
                return 1;
@@ -331,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
 
        if (depth == 0) {
                /* leaf entries */
-               ext = EXT_FIRST_EXTENT(eh);
+               struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
                while (entries) {
                        if (!ext4_valid_extent(inode, ext))
                                return 0;
@@ -339,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
                        entries--;
                }
        } else {
-               ext_idx = EXT_FIRST_INDEX(eh);
+               struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
                while (entries) {
                        if (!ext4_valid_extent_idx(inode, ext_idx))
                                return 0;
@@ -751,31 +744,30 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                return -EIO;
        }
 
-       len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
                /* insert after */
-               if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
-                       len = (len - 1) * sizeof(struct ext4_extent_idx);
-                       len = len < 0 ? 0 : len;
-                       ext_debug("insert new index %d after: %llu. "
-                                       "move %d from 0x%p to 0x%p\n",
-                                       logical, ptr, len,
-                                       (curp->p_idx + 1), (curp->p_idx + 2));
-                       memmove(curp->p_idx + 2, curp->p_idx + 1, len);
-               }
+               ext_debug("insert new index %d after: %llu\n", logical, ptr);
                ix = curp->p_idx + 1;
        } else {
                /* insert before */
-               len = len * sizeof(struct ext4_extent_idx);
-               len = len < 0 ? 0 : len;
-               ext_debug("insert new index %d before: %llu. "
-                               "move %d from 0x%p to 0x%p\n",
-                               logical, ptr, len,
-                               curp->p_idx, (curp->p_idx + 1));
-               memmove(curp->p_idx + 1, curp->p_idx, len);
+               ext_debug("insert new index %d before: %llu\n", logical, ptr);
                ix = curp->p_idx;
        }
 
+       len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+       BUG_ON(len < 0);
+       if (len > 0) {
+               ext_debug("insert new index %d: "
+                               "move %d indices from 0x%p to 0x%p\n",
+                               logical, len, ix, ix + 1);
+               memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+       }
+
+       if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+               EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+               return -EIO;
+       }
+
        ix->ei_block = cpu_to_le32(logical);
        ext4_idx_store_pblock(ix, ptr);
        le16_add_cpu(&curp->p_hdr->eh_entries, 1);
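
The restructured insertion above first picks the insertion point ix, then shifts every later index entry up by one slot with a single memmove() before writing the new entry, replacing the old error-prone negative-length arithmetic. A standalone C sketch of the same shift-and-insert step on a plain array (types and values are illustrative):

#include <stdio.h>
#include <string.h>

struct idx {
	unsigned int block;       /* first logical block covered by this index */
};

int main(void)
{
	struct idx entries[8] = { {10}, {20}, {40}, {50} };
	int nr = 4;
	struct idx *ix = &entries[2];          /* insert before the entry for 40 */
	int len = nr - 2;                      /* entries from ix to the last one */

	if (len > 0)                           /* shift them up by one slot */
		memmove(ix + 1, ix, len * sizeof(*ix));
	ix->block = 30;                        /* write the new entry into the hole */
	nr++;

	for (int i = 0; i < nr; i++)
		printf("%u ", entries[i].block);
	printf("\n");                          /* prints: 10 20 30 40 50 */
	return 0;
}
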
@@ -1042,16 +1034,14 @@ cleanup:
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                                 unsigned int flags,
-                                struct ext4_ext_path *path,
                                 struct ext4_extent *newext)
 {
-       struct ext4_ext_path *curp = path;
        struct ext4_extent_header *neh;
        struct buffer_head *bh;
        ext4_fsblk_t newblock;
        int err = 0;
 
-       newblock = ext4_ext_new_meta_block(handle, inode, path,
+       newblock = ext4_ext_new_meta_block(handle, inode, NULL,
                newext, &err, flags);
        if (newblock == 0)
                return err;
@@ -1071,7 +1061,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        }
 
        /* move top-level index/leaf into new block */
-       memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+       memmove(bh->b_data, EXT4_I(inode)->i_data,
+               sizeof(EXT4_I(inode)->i_data));
 
        /* set size of new block */
        neh = ext_block_hdr(bh);
@@ -1089,32 +1080,23 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        if (err)
                goto out;
 
-       /* create index in new top-level index: num,max,pointer */
-       err = ext4_ext_get_access(handle, inode, curp);
-       if (err)
-               goto out;
-
-       curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
-       curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
-       curp->p_hdr->eh_entries = cpu_to_le16(1);
-       curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
-
-       if (path[0].p_hdr->eh_depth)
-               curp->p_idx->ei_block =
-                       EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
-       else
-               curp->p_idx->ei_block =
-                       EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
-       ext4_idx_store_pblock(curp->p_idx, newblock);
-
+       /* Update top-level index: num,max,pointer */
        neh = ext_inode_hdr(inode);
+       neh->eh_entries = cpu_to_le16(1);
+       ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+       if (neh->eh_depth == 0) {
+               /* Root extent block becomes index block */
+               neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+               EXT_FIRST_INDEX(neh)->ei_block =
+                       EXT_FIRST_EXTENT(neh)->ee_block;
+       }
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
-       neh->eh_depth = cpu_to_le16(path->p_depth + 1);
-       err = ext4_ext_dirty(handle, inode, curp);
+       neh->eh_depth = cpu_to_le16(neh->eh_depth + 1);
+       ext4_mark_inode_dirty(handle, inode);
 out:
        brelse(bh);
 
@@ -1162,8 +1144,7 @@ repeat:
                        err = PTR_ERR(path);
        } else {
                /* tree is full, time to grow in depth */
-               err = ext4_ext_grow_indepth(handle, inode, flags,
-                                           path, newext);
+               err = ext4_ext_grow_indepth(handle, inode, flags, newext);
                if (err)
                        goto out;
 
@@ -1235,9 +1216,9 @@ static int ext4_ext_search_left(struct inode *inode,
                        if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
                                EXT4_ERROR_INODE(inode,
                                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
-                                 ix != NULL ? ix->ei_block : 0,
+                                 ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
                                  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
-                                   EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+               le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
                                  depth);
                                return -EIO;
                        }
@@ -1260,13 +1241,14 @@ static int ext4_ext_search_left(struct inode *inode,
 /*
  * search the closest allocated block to the right for *logical
  * and returns it at @logical + its physical address at @phys
- * if *logical is the smallest allocated block, the function
+ * if *logical is the largest allocated block, the function
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
 static int ext4_ext_search_right(struct inode *inode,
                                 struct ext4_ext_path *path,
-                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
+                                ext4_lblk_t *logical, ext4_fsblk_t *phys,
+                                struct ext4_extent **ret_ex)
 {
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *eh;
@@ -1308,9 +1290,7 @@ static int ext4_ext_search_right(struct inode *inode,
                                return -EIO;
                        }
                }
-               *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext4_ext_pblock(ex);
-               return 0;
+               goto found_extent;
        }
 
        if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
@@ -1323,9 +1303,7 @@ static int ext4_ext_search_right(struct inode *inode,
        if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
                /* next allocated block in this leaf */
                ex++;
-               *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext4_ext_pblock(ex);
-               return 0;
+               goto found_extent;
        }
 
        /* go up and search for index to the right */
@@ -1368,9 +1346,12 @@ got_index:
                return -EIO;
        }
        ex = EXT_FIRST_EXTENT(eh);
+found_extent:
        *logical = le32_to_cpu(ex->ee_block);
        *phys = ext4_ext_pblock(ex);
-       put_bh(bh);
+       *ret_ex = ex;
+       if (bh)
+               put_bh(bh);
        return 0;
 }
 
@@ -1395,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
        while (depth >= 0) {
                if (depth == path->p_depth) {
                        /* leaf */
-                       if (path[depth].p_ext !=
+                       if (path[depth].p_ext &&
+                               path[depth].p_ext !=
                                        EXT_LAST_EXTENT(path[depth].p_hdr))
                          return le32_to_cpu(path[depth].p_ext[1].ee_block);
                } else {
@@ -1623,7 +1605,8 @@ static int ext4_ext_try_to_merge(struct inode *inode,
  * such that there will be no overlap, and then returns 1.
  * If there is no overlap found, it returns 0.
  */
-static unsigned int ext4_ext_check_overlap(struct inode *inode,
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+                                          struct inode *inode,
                                           struct ext4_extent *newext,
                                           struct ext4_ext_path *path)
 {
@@ -1637,6 +1620,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
        if (!path[depth].p_ext)
                goto out;
        b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+       b2 &= ~(sbi->s_cluster_ratio - 1);
 
        /*
         * get the next allocated block if the extent in the path
@@ -1646,6 +1630,7 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode,
                b2 = ext4_ext_next_allocated_block(path);
                if (b2 == EXT_MAX_BLOCKS)
                        goto out;
+               b2 &= ~(sbi->s_cluster_ratio - 1);
        }
 
        /* check for wrap through zero on extent logical start block*/
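
The two new `b2 &= ~(sbi->s_cluster_ratio - 1);` lines round the neighbouring block number down to the start of its cluster so the overlap check operates in cluster units on bigalloc file systems. A small standalone sketch of that masking, assuming a power-of-two cluster ratio as bigalloc guarantees (the values are made up):

#include <stdio.h>

int main(void)
{
	unsigned int cluster_ratio = 16;	/* blocks per cluster, power of two */
	unsigned int blocks[] = { 0, 5, 16, 37, 63 };
	unsigned int i;

	for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
		printf("block %2u -> cluster start %2u (cluster #%u)\n",
		       blocks[i],
		       blocks[i] & ~(cluster_ratio - 1),
		       blocks[i] / cluster_ratio);
	return 0;
}
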
@@ -1697,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        /* try to insert block into found extent and return */
        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
-               ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+               ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
                          ext4_ext_is_uninitialized(newext),
                          ext4_ext_get_actual_len(newext),
                          le32_to_cpu(ex->ee_block),
@@ -1735,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
                next = ext4_ext_next_leaf_block(path);
        if (next != EXT_MAX_BLOCKS) {
-               ext_debug("next leaf block - %d\n", next);
+               ext_debug("next leaf block - %u\n", next);
                BUG_ON(npath != NULL);
                npath = ext4_ext_find_extent(inode, next, NULL);
                if (IS_ERR(npath))
@@ -1773,46 +1758,51 @@ has_space:
 
        if (!nearex) {
                /* there is no extent in this leaf, create first one */
-               ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
+               ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
                                ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext));
-               path[depth].p_ext = EXT_FIRST_EXTENT(eh);
-       } else if (le32_to_cpu(newext->ee_block)
+               nearex = EXT_FIRST_EXTENT(eh);
+       } else {
+               if (le32_to_cpu(newext->ee_block)
                           > le32_to_cpu(nearex->ee_block)) {
-/*             BUG_ON(newext->ee_block == nearex->ee_block); */
-               if (nearex != EXT_LAST_EXTENT(eh)) {
-                       len = EXT_MAX_EXTENT(eh) - nearex;
-                       len = (len - 1) * sizeof(struct ext4_extent);
-                       len = len < 0 ? 0 : len;
-                       ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
-                                       "move %d from 0x%p to 0x%p\n",
+                       /* Insert after */
+                       ext_debug("insert %u:%llu:[%d]%d before: "
+                                       "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_uninitialized(newext),
                                        ext4_ext_get_actual_len(newext),
-                                       nearex, len, nearex + 1, nearex + 2);
-                       memmove(nearex + 2, nearex + 1, len);
+                                       nearex);
+                       nearex++;
+               } else {
+                       /* Insert before */
+                       BUG_ON(newext->ee_block == nearex->ee_block);
+                       ext_debug("insert %u:%llu:[%d]%d after: "
+                                       "nearest %p\n",
+                                       le32_to_cpu(newext->ee_block),
+                                       ext4_ext_pblock(newext),
+                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_get_actual_len(newext),
+                                       nearex);
+               }
+               len = EXT_LAST_EXTENT(eh) - nearex + 1;
+               if (len > 0) {
+                       ext_debug("insert %u:%llu:[%d]%d: "
+                                       "move %d extents from 0x%p to 0x%p\n",
+                                       le32_to_cpu(newext->ee_block),
+                                       ext4_ext_pblock(newext),
+                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_get_actual_len(newext),
+                                       len, nearex, nearex + 1);
+                       memmove(nearex + 1, nearex,
+                               len * sizeof(struct ext4_extent));
                }
-               path[depth].p_ext = nearex + 1;
-       } else {
-               BUG_ON(newext->ee_block == nearex->ee_block);
-               len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
-               len = len < 0 ? 0 : len;
-               ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
-                               "move %d from 0x%p to 0x%p\n",
-                               le32_to_cpu(newext->ee_block),
-                               ext4_ext_pblock(newext),
-                               ext4_ext_is_uninitialized(newext),
-                               ext4_ext_get_actual_len(newext),
-                               nearex, len, nearex, nearex + 1);
-               memmove(nearex + 1, nearex, len);
-               path[depth].p_ext = nearex;
        }
 
        le16_add_cpu(&eh->eh_entries, 1);
-       nearex = path[depth].p_ext;
+       path[depth].p_ext = nearex;
        nearex->ee_block = newext->ee_block;
        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;
@@ -1962,6 +1952,7 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
        struct ext4_ext_cache *cex;
        BUG_ON(len == 0);
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       trace_ext4_ext_put_in_cache(inode, block, len, start);
        cex = &EXT4_I(inode)->i_cached_extent;
        cex->ec_block = block;
        cex->ec_len = len;
@@ -2063,6 +2054,7 @@ errout:
                sbi->extent_cache_misses++;
        else
                sbi->extent_cache_hits++;
+       trace_ext4_ext_in_cache(inode, block, ret);
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        return ret;
 }
@@ -2130,6 +2122,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        if (err)
                return err;
        ext_debug("index is empty, remove it, free block %llu\n", leaf);
+       trace_ext4_ext_rm_idx(inode, leaf);
+
        ext4_free_blocks(handle, inode, NULL, leaf, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
        return err;
@@ -2158,7 +2152,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                         *  need to account for leaf block credit
                         *
                         *  bitmaps and block group descriptor blocks
-                        *  and other metadat blocks still need to be
+                        *  and other metadata blocks still need to be
                         *  accounted.
                         */
                        /* 1 bitmap, 1 block group descriptor */
@@ -2195,14 +2189,40 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 }
 
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
-                               struct ext4_extent *ex,
-                               ext4_lblk_t from, ext4_lblk_t to)
+                             struct ext4_extent *ex,
+                             ext4_fsblk_t *partial_cluster,
+                             ext4_lblk_t from, ext4_lblk_t to)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
+       ext4_fsblk_t pblk;
        int flags = EXT4_FREE_BLOCKS_FORGET;
 
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
+       /*
+        * For bigalloc file systems, we never free a partial cluster
+        * at the beginning of the extent.  Instead, we make a note
+        * that we tried freeing the cluster, and check to see if we
+        * need to free it on a subsequent call to ext4_remove_blocks,
+        * or at the end of the ext4_truncate() operation.
+        */
+       flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+       trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+       /*
+        * If we have a partial cluster, and it's different from the
+        * cluster of the last block, we need to explicitly free the
+        * partial cluster here.
+        */
+       pblk = ext4_ext_pblock(ex) + ee_len - 1;
+       if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+               ext4_free_blocks(handle, inode, NULL,
+                                EXT4_C2B(sbi, *partial_cluster),
+                                sbi->s_cluster_ratio, flags);
+               *partial_cluster = 0;
+       }
+
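
The new partial_cluster parameter threads bigalloc bookkeeping through the removal path: a removal that stops mid-cluster records the cluster instead of freeing it (the recording side is in the tail-removal hunk just below), and a later call frees it once the extent being removed clearly lives in a different cluster. A compressed userspace sketch of that handshake; CLUSTER_RATIO, B2C() and remove_tail() are illustrative stand-ins for the kernel helpers, and the sample numbers are made up.

#include <stdio.h>

#define CLUSTER_RATIO	4			/* blocks per cluster, power of two */
#define B2C(pblk)	((pblk) / CLUSTER_RATIO)

static unsigned long long partial_cluster;

static void remove_tail(unsigned long long ext_start, unsigned int ext_len,
			unsigned int blocks_removed)
{
	unsigned long long pblk = ext_start + ext_len - blocks_removed;

	/* a cluster recorded earlier is freed only once nothing else uses it */
	if (partial_cluster && B2C(ext_start + ext_len - 1) != partial_cluster) {
		printf("free cluster %llu (no longer referenced)\n",
		       partial_cluster);
		partial_cluster = 0;
	}

	printf("free %u blocks starting at %llu\n", blocks_removed, pblk);

	/* whole extent gone, but its first block not on a cluster boundary */
	if ((pblk % CLUSTER_RATIO) && blocks_removed == ext_len)
		partial_cluster = B2C(pblk);
	else
		partial_cluster = 0;
}

int main(void)
{
	/* truncate walks right to left over two extents */
	remove_tail(102, 6, 6);		/* records partial cluster 25 */
	remove_tail(40, 8, 8);		/* different cluster: 25 gets freed */
	return 0;
}
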
 #ifdef EXTENTS_STATS
        {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2222,12 +2242,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
            && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
                /* tail removal */
                ext4_lblk_t num;
-               ext4_fsblk_t start;
 
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
-               start = ext4_ext_pblock(ex) + ee_len - num;
-               ext_debug("free last %u blocks starting %llu\n", num, start);
-               ext4_free_blocks(handle, inode, NULL, start, num, flags);
+               pblk = ext4_ext_pblock(ex) + ee_len - num;
+               ext_debug("free last %u blocks starting %llu\n", num, pblk);
+               ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+               /*
+                * If the block range to be freed didn't start at the
+                * beginning of a cluster, and we removed the entire
+                * extent, save the partial cluster here, since we
+                * might need to delete it if we determine that the
+                * truncate operation has removed all of the blocks in
+                * the cluster.
+                */
+               if (pblk & (sbi->s_cluster_ratio - 1) &&
+                   (ee_len == num))
+                       *partial_cluster = EXT4_B2C(sbi, pblk);
+               else
+                       *partial_cluster = 0;
        } else if (from == le32_to_cpu(ex->ee_block)
                   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
                /* head removal */
@@ -2238,7 +2270,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                start = ext4_ext_pblock(ex);
 
                ext_debug("free first %u blocks starting %llu\n", num, start);
-               ext4_free_blocks(handle, inode, 0, start, num, flags);
+               ext4_free_blocks(handle, inode, NULL, start, num, flags);
 
        } else {
                printk(KERN_INFO "strange request: removal(2) "
@@ -2262,19 +2294,19 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-               struct ext4_ext_path *path, ext4_lblk_t start,
-               ext4_lblk_t end)
+                struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+                ext4_lblk_t start, ext4_lblk_t end)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err = 0, correct_index = 0;
        int depth = ext_depth(inode), credits;
        struct ext4_extent_header *eh;
-       ext4_lblk_t a, b, block;
+       ext4_lblk_t a, b;
        unsigned num;
        ext4_lblk_t ex_ee_block;
        unsigned short ex_ee_len;
        unsigned uninitialized = 0;
        struct ext4_extent *ex;
-       struct ext4_map_blocks map;
 
        /* the header must be checked already in ext4_ext_remove_space() */
        ext_debug("truncate since %u in leaf\n", start);
@@ -2291,6 +2323,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        ex_ee_block = le32_to_cpu(ex->ee_block);
        ex_ee_len = ext4_ext_get_actual_len(ex);
 
+       trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
        while (ex >= EXT_FIRST_EXTENT(eh) &&
                        ex_ee_block + ex_ee_len > start) {
 
@@ -2315,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        ex_ee_block = le32_to_cpu(ex->ee_block);
                        ex_ee_len = ext4_ext_get_actual_len(ex);
                        continue;
-               } else if (a != ex_ee_block &&
-                       b != ex_ee_block + ex_ee_len - 1) {
-                       /*
-                        * If this is a truncate, then this condition should
-                        * never happen because at least one of the end points
-                        * needs to be on the edge of the extent.
-                        */
-                       if (end == EXT_MAX_BLOCKS - 1) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                               start, end);
-                               block = 0;
-                               num = 0;
-                               err = -EIO;
-                               goto out;
-                       }
-                       /*
-                        * else this is a hole punch, so the extent needs to
-                        * be split since neither edge of the hole is on the
-                        * extent edge
-                        */
-                       else{
-                               map.m_pblk = ext4_ext_pblock(ex);
-                               map.m_lblk = ex_ee_block;
-                               map.m_len = b - ex_ee_block;
-
-                               err = ext4_split_extent(handle,
-                                       inode, path, &map, 0,
-                                       EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
-                                       EXT4_GET_BLOCKS_PRE_IO);
-
-                               if (err < 0)
-                                       goto out;
-
-                               ex_ee_len = ext4_ext_get_actual_len(ex);
-
-                               b = ex_ee_block+ex_ee_len - 1 < end ?
-                                       ex_ee_block+ex_ee_len - 1 : end;
-
-                               /* Then remove tail of this extent */
-                               block = ex_ee_block;
-                               num = a - block;
-                       }
+               } else if (b != ex_ee_block + ex_ee_len - 1) {
+                       EXT4_ERROR_INODE(inode, "  bad truncate %u:%u\n",
+                                        start, end);
+                       err = -EIO;
+                       goto out;
                } else if (a != ex_ee_block) {
                        /* remove tail of the extent */
-                       block = ex_ee_block;
-                       num = a - block;
-               } else if (b != ex_ee_block + ex_ee_len - 1) {
-                       /* remove head of the extent */
-                       block = b;
-                       num =  ex_ee_block + ex_ee_len - b;
-
-                       /*
-                        * If this is a truncate, this condition
-                        * should never happen
-                        */
-                       if (end == EXT_MAX_BLOCKS - 1) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                       start, end);
-                               err = -EIO;
-                               goto out;
-                       }
+                       num = a - ex_ee_block;
                } else {
                        /* remove whole extent: excellent! */
-                       block = ex_ee_block;
                        num = 0;
-                       if (a != ex_ee_block) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                       start, end);
-                               err = -EIO;
-                               goto out;
-                       }
-
-                       if (b != ex_ee_block + ex_ee_len - 1) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                       start, end);
-                               err = -EIO;
-                               goto out;
-                       }
                }
-
                /*
                 * 3 for leaf, sb, and inode plus 2 (bmap and group
                 * descriptor) for each block group; assume two block
@@ -2416,23 +2382,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                if (err)
                        goto out;
 
-               err = ext4_remove_blocks(handle, inode, ex, a, b);
+               err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+                                        a, b);
                if (err)
                        goto out;
 
-               if (num == 0) {
+               if (num == 0)
                        /* this extent is removed; mark slot entirely unused */
                        ext4_ext_store_pblock(ex, 0);
-               } else if (block != ex_ee_block) {
-                       /*
-                        * If this was a head removal, then we need to update
-                        * the physical block since it is now at a different
-                        * location
-                        */
-                       ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
-               }
 
-               ex->ee_block = cpu_to_le32(block);
                ex->ee_len = cpu_to_le16(num);
                /*
                 * Do not mark uninitialized if all the blocks in the
@@ -2440,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                 */
                if (uninitialized && num)
                        ext4_ext_mark_uninitialized(ex);
-
-               err = ext4_ext_dirty(handle, inode, path + depth);
-               if (err)
-                       goto out;
-
                /*
                 * If the extent was completely released,
                 * we need to remove it from the leaf
@@ -2464,9 +2417,14 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                                        sizeof(struct ext4_extent));
                        }
                        le16_add_cpu(&eh->eh_entries, -1);
-               }
+               } else
+                       *partial_cluster = 0;
 
-               ext_debug("new extent: %u:%u:%llu\n", block, num,
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
                                ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2476,6 +2434,25 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        if (correct_index && eh->eh_entries)
                err = ext4_ext_correct_indexes(handle, inode, path);
 
+       /*
+        * If there is still an entry in the leaf node, check to see if
+        * it references the partial cluster.  This is the only place
+        * where it could; if it doesn't, we can free the cluster.
+        */
+       if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+           (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
+            *partial_cluster)) {
+               int flags = EXT4_FREE_BLOCKS_FORGET;
+
+               if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+                       flags |= EXT4_FREE_BLOCKS_METADATA;
+
+               ext4_free_blocks(handle, inode, NULL,
+                                EXT4_C2B(sbi, *partial_cluster),
+                                sbi->s_cluster_ratio, flags);
+               *partial_cluster = 0;
+       }
+
        /* if this leaf is free, then we should
         * remove it from index block above */
        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
@@ -2511,6 +2488,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
        struct super_block *sb = inode->i_sb;
        int depth = ext_depth(inode);
        struct ext4_ext_path *path;
+       ext4_fsblk_t partial_cluster = 0;
        handle_t *handle;
        int i, err;
 
@@ -2524,6 +2502,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 again:
        ext4_ext_invalidate_cache(inode);
 
+       trace_ext4_ext_remove_space(inode, start, depth);
+
        /*
         * We start scanning from right side, freeing all the blocks
         * after i_size and walking into the tree depth-wise.
@@ -2546,7 +2526,8 @@ again:
                if (i == depth) {
                        /* this is leaf block */
                        err = ext4_ext_rm_leaf(handle, inode, path,
-                                       start, EXT_MAX_BLOCKS - 1);
+                                              &partial_cluster, start,
+                                              EXT_MAX_BLOCKS - 1);
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
@@ -2618,6 +2599,24 @@ again:
                }
        }
 
+       trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
+                       path->p_hdr->eh_entries);
+
+       /* If we still have something in the partial cluster and we have removed
+        * even the first extent, then we should free the blocks in the partial
+        * cluster as well. */
+       if (partial_cluster && path->p_hdr->eh_entries == 0) {
+               int flags = EXT4_FREE_BLOCKS_FORGET;
+
+               if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+                       flags |= EXT4_FREE_BLOCKS_METADATA;
+
+               ext4_free_blocks(handle, inode, NULL,
+                                EXT4_C2B(EXT4_SB(sb), partial_cluster),
+                                EXT4_SB(sb)->s_cluster_ratio, flags);
+               partial_cluster = 0;
+       }
+
        /* TODO: flexible tree reduction should be here */
        if (path->p_hdr->eh_entries == 0) {
                /*
@@ -2909,17 +2908,29 @@ out:
  *   a> There is no split required: Entire extent should be initialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Someone is writing in the middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->m_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
                                           struct ext4_ext_path *path)
 {
+       struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block, eof_block;
-       unsigned int allocated, ee_len, depth;
+       unsigned int ee_len, depth;
+       int allocated;
        int err = 0;
        int split_flag = 0;
 
@@ -2933,11 +2944,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                eof_block = map->m_lblk + map->m_len;
 
        depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
 
+       trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+       /* Pre-conditions */
+       BUG_ON(!ext4_ext_is_uninitialized(ex));
+       BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+       BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+       /*
+        * Attempt to transfer newly initialized blocks from the currently
+        * uninitialized extent to its left neighbor. This is much cheaper
+        * than an insertion followed by a merge as those involve costly
+        * memmove() calls. This is the common case in steady state for
+        * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+        * writes.
+        *
+        * Limitations of the current logic:
+        *  - L1: we only deal with writes at the start of the extent.
+        *    The approach could be extended to writes at the end
+        *    of the extent but this scenario was deemed less common.
+        *  - L2: we do not deal with writes covering the whole extent.
+        *    This would require removing the extent if the transfer
+        *    is possible.
+        *  - L3: we only attempt to merge with an extent stored in the
+        *    same extent tree node.
+        */
+       if ((map->m_lblk == ee_block) &&        /*L1*/
+               (map->m_len < ee_len) &&        /*L2*/
+               (ex > EXT_FIRST_EXTENT(eh))) {  /*L3*/
+               struct ext4_extent *prev_ex;
+               ext4_lblk_t prev_lblk;
+               ext4_fsblk_t prev_pblk, ee_pblk;
+               unsigned int prev_len, write_len;
+
+               prev_ex = ex - 1;
+               prev_lblk = le32_to_cpu(prev_ex->ee_block);
+               prev_len = ext4_ext_get_actual_len(prev_ex);
+               prev_pblk = ext4_ext_pblock(prev_ex);
+               ee_pblk = ext4_ext_pblock(ex);
+               write_len = map->m_len;
+
+               /*
+                * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+                * upon those conditions:
+                * - C1: prev_ex is initialized,
+                * - C2: prev_ex is logically abutting ex,
+                * - C3: prev_ex is physically abutting ex,
+                * - C4: prev_ex can receive the additional blocks without
+                *   overflowing the (initialized) length limit.
+                */
+               if ((!ext4_ext_is_uninitialized(prev_ex)) &&            /*C1*/
+                       ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
+                       ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
+                       (prev_len < (EXT_INIT_MAX_LEN - write_len))) {  /*C4*/
+                       err = ext4_ext_get_access(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       trace_ext4_ext_convert_to_initialized_fastpath(inode,
+                               map, ex, prev_ex);
+
+                       /* Shift the start of ex by 'write_len' blocks */
+                       ex->ee_block = cpu_to_le32(ee_block + write_len);
+                       ext4_ext_store_pblock(ex, ee_pblk + write_len);
+                       ex->ee_len = cpu_to_le16(ee_len - write_len);
+                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+                       /* Extend prev_ex by 'write_len' blocks */
+                       prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+                       /* Mark the block containing both extents as dirty */
+                       ext4_ext_dirty(handle, inode, path + depth);
+
+                       /* Update path to point to the right extent */
+                       path[depth].p_ext = prev_ex;
+
+                       /* Result: number of initialized blocks past m_lblk */
+                       allocated = write_len;
+                       goto out;
+               }
+       }
+
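
This fast path moves the freshly written blocks from the front of the uninitialized extent into its initialized left neighbour when conditions C1-C4 hold, avoiding a split followed by a merge. A userspace sketch of the same transfer under those four checks; extent, MAX_INIT_LEN and try_transfer() are made-up names, and the length check only approximates the EXT_INIT_MAX_LEN test.

#include <stdio.h>

#define MAX_INIT_LEN	32768	/* cap on an initialized extent's length */

struct extent {
	unsigned int lblk;		/* first logical block */
	unsigned long long pblk;	/* first physical block */
	unsigned int len;
	int uninitialized;
};

static int try_transfer(struct extent *prev, struct extent *ex,
			unsigned int write_len)
{
	if (prev->uninitialized)			/* C1 */
		return 0;
	if (prev->lblk + prev->len != ex->lblk)		/* C2 */
		return 0;
	if (prev->pblk + prev->len != ex->pblk)		/* C3 */
		return 0;
	if (prev->len + write_len > MAX_INIT_LEN)	/* C4, approximate */
		return 0;

	/* shrink 'ex' from the front, grow 'prev' by the same amount */
	ex->lblk += write_len;
	ex->pblk += write_len;
	ex->len -= write_len;
	prev->len += write_len;
	return 1;
}

int main(void)
{
	struct extent prev = { 100, 5000, 10, 0 };	/* initialized */
	struct extent ex   = { 110, 5010, 20, 1 };	/* uninitialized */

	if (try_transfer(&prev, &ex, 4))
		printf("prev now %u+%u, ex now %u+%u\n",
		       prev.lblk, prev.len, ex.lblk, ex.len);
	return 0;
}
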
        WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
@@ -3165,6 +3258,192 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
        return ext4_mark_inode_dirty(handle, inode);
 }
 
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
+ * whether there are any buffers marked for delayed allocation. It returns '1'
+ * on the first delalloc'ed buffer head found. If no buffer head in the given
+ * range is marked for delalloc, it returns 0.
+ * lblk_start should always be <= lblk_end.
+ * search_hint_reverse is to indicate that searching in reverse from lblk_end to
+ * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
+ * block sooner). This is useful when blocks are truncated sequentially from
+ * lblk_start towards lblk_end.
+ */
+static int ext4_find_delalloc_range(struct inode *inode,
+                                   ext4_lblk_t lblk_start,
+                                   ext4_lblk_t lblk_end,
+                                   int search_hint_reverse)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct buffer_head *head, *bh = NULL;
+       struct page *page;
+       ext4_lblk_t i, pg_lblk;
+       pgoff_t index;
+
+       /* reverse search won't work if fs block size is less than page size */
+       if (inode->i_blkbits < PAGE_CACHE_SHIFT)
+               search_hint_reverse = 0;
+
+       if (search_hint_reverse)
+               i = lblk_end;
+       else
+               i = lblk_start;
+
+       index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+       while ((i >= lblk_start) && (i <= lblk_end)) {
+               page = find_get_page(mapping, index);
+               if (!page)
+                       goto nextpage;
+
+               if (!page_has_buffers(page))
+                       goto nextpage;
+
+               head = page_buffers(page);
+               if (!head)
+                       goto nextpage;
+
+               bh = head;
+               pg_lblk = index << (PAGE_CACHE_SHIFT -
+                                               inode->i_blkbits);
+               do {
+                       if (unlikely(pg_lblk < lblk_start)) {
+                               /*
+                                * This is possible when fs block size is less
+                                * than page size and our cluster starts/ends in the
+                                * middle of the page. So we need to skip the
+                                * initial few blocks till we reach the 'lblk'
+                                */
+                               pg_lblk++;
+                               continue;
+                       }
+
+                       /* Check if the buffer is delayed allocated and that it
+                        * is not yet mapped. (when da-buffers are mapped during
+                        * their writeout, their da_mapped bit is set.)
+                        */
+                       if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
+                               page_cache_release(page);
+                               trace_ext4_find_delalloc_range(inode,
+                                               lblk_start, lblk_end,
+                                               search_hint_reverse,
+                                               1, i);
+                               return 1;
+                       }
+                       if (search_hint_reverse)
+                               i--;
+                       else
+                               i++;
+               } while ((i >= lblk_start) && (i <= lblk_end) &&
+                               ((bh = bh->b_this_page) != head));
+nextpage:
+               if (page)
+                       page_cache_release(page);
+               /*
+                * Move to next page. 'i' will be the first lblk in the next
+                * page.
+                */
+               if (search_hint_reverse)
+                       index--;
+               else
+                       index++;
+               i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       }
+
+       trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+                                       search_hint_reverse, 0, 0);
+       return 0;
+}
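
The scan above walks the page cache, converting between logical block numbers and page indexes with shifts by (PAGE_CACHE_SHIFT - i_blkbits). A tiny sketch of just that arithmetic, with illustrative shift values for 4K pages and 1K file system blocks:

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* 4096-byte pages */
	unsigned int blkbits = 10;	/* 1024-byte file system blocks */
	unsigned int lblk = 13;
	unsigned long index;
	unsigned int first_lblk_in_page;

	index = lblk >> (page_shift - blkbits);
	first_lblk_in_page = index << (page_shift - blkbits);

	printf("block %u lives in page %lu, whose first block is %u\n",
	       lblk, index, first_lblk_in_page);	/* page 3, block 12 */
	return 0;
}
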
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+                              int search_hint_reverse)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       ext4_lblk_t lblk_start, lblk_end;
+       lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+       lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+       return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+                                       search_hint_reverse);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicates delayed allocated blocks
+ *  '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ *     are not delalloc'ed.
+ *     Ex:
+ *     |----c---=|====c====|====c====|===-c----|
+ *              |++++++ allocated ++++++|
+ *     ==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ *     marked for delayed allocation. In this case, we will exclude that
+ *     cluster.
+ *     Ex:
+ *     |----====c========|========c========|
+ *          |++++++ allocated ++++++|
+ *     ==> 1 complete cluster in above example
+ *
+ *     Ex:
+ *     |================c================|
+ *            |++++++ allocated ++++++|
+ *     ==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+                          unsigned int num_blks)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+       ext4_lblk_t lblk_from, lblk_to, c_offset;
+       unsigned int allocated_clusters = 0;
+
+       alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+       alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+       /* max possible clusters for this allocation */
+       allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+       trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
+       /* Check towards left side */
+       c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+       if (c_offset) {
+               lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+               lblk_to = lblk_from + c_offset - 1;
+
+               if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+                       allocated_clusters--;
+       }
+
+       /* Now check towards right. */
+       c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+       if (allocated_clusters && c_offset) {
+               lblk_from = lblk_start + num_blks;
+               lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+               if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+                       allocated_clusters--;
+       }
+
+       return allocated_clusters;
+}
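
get_reserved_cluster_alloc() counts the clusters spanned by the allocated range and then drops an edge cluster whenever the part of it outside the range is still delalloc, since that cluster's reservation must survive. A userspace sketch of the same counting; range_has_delalloc() is a stub standing in for ext4_find_delalloc_range(), and the cluster ratio and sample ranges are made up.

#include <stdio.h>

#define CLUSTER_RATIO	4	/* blocks per cluster, power of two */
#define B2C(lblk)	((lblk) / CLUSTER_RATIO)

/* stub: pretend blocks 0..5 of the file are still delalloc */
static int range_has_delalloc(unsigned int from, unsigned int to)
{
	return from <= 5;
}

static unsigned int reserved_clusters(unsigned int lblk_start,
				      unsigned int num_blks)
{
	unsigned int first = B2C(lblk_start);
	unsigned int last = B2C(lblk_start + num_blks - 1);
	unsigned int clusters = last - first + 1;
	unsigned int c_offset;

	/* left edge: is the slice of the first cluster before us delalloc? */
	c_offset = lblk_start % CLUSTER_RATIO;
	if (c_offset &&
	    range_has_delalloc(lblk_start - c_offset, lblk_start - 1))
		clusters--;

	/* right edge: same question for the tail of the last cluster */
	c_offset = (lblk_start + num_blks) % CLUSTER_RATIO;
	if (clusters && c_offset &&
	    range_has_delalloc(lblk_start + num_blks,
			       lblk_start + num_blks + CLUSTER_RATIO - c_offset - 1))
		clusters--;

	return clusters;
}

int main(void)
{
	/* blocks 6..13 being allocated while blocks 0..5 are still delalloc */
	printf("%u reserved cluster(s)\n", reserved_clusters(6, 8));	/* 2 */
	return 0;
}
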
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3181,6 +3460,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
 
+       trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
+                                                   newblock);
+
        /* get_block() before submit the IO, split the extent */
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                ret = ext4_split_unwritten_extents(handle, inode, map,
@@ -3190,10 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * that this IO needs to conversion to written when IO is
                 * completed
                 */
-               if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
-                       io->flag = EXT4_IO_END_UNWRITTEN;
-                       atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-               } else
+               if (io)
+                       ext4_set_io_unwritten_flag(inode, io);
+               else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
                        map->m_flags |= EXT4_MAP_UNINIT;
@@ -3234,14 +3515,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
        /* buffered write, writepage time, convert*/
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-       if (ret >= 0) {
+       if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
-               err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
-                                        map->m_len);
-               if (err < 0)
-                       goto out2;
-       }
-
 out:
        if (ret <= 0) {
                err = ret;
@@ -3270,11 +3545,24 @@ out:
         * But fallocate would have already updated quota and block
         * count for this offset. So cancel these reservations
         */
-       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-               ext4_da_update_reserve_space(inode, allocated, 0);
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+               unsigned int reserved_clusters;
+               reserved_clusters = get_reserved_cluster_alloc(inode,
+                               map->m_lblk, map->m_len);
+               if (reserved_clusters)
+                       ext4_da_update_reserve_space(inode,
+                                                    reserved_clusters,
+                                                    0);
+       }
 
 map_out:
        map->m_flags |= EXT4_MAP_MAPPED;
+       if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+               err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+                                        map->m_len);
+               if (err < 0)
+                       goto out2;
+       }
 out1:
        if (allocated > map->m_len)
                allocated = map->m_len;
@@ -3289,6 +3577,111 @@ out2:
        return err ? err : allocated;
 }
 
+/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ *     @sb     The filesystem superblock structure
+ *     @map    The requested lblk->pblk mapping
+ *     @ex     The extent structure which might contain an implied
+ *                     cluster allocation
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree.  Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree.  There are three cases we
+ * want to catch.  The first is this case:
+ *
+ *              |--- cluster # N--|
+ *    |--- extent ---| |---- requested region ---|
+ *                     |==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ *   |--------- cluster # N ----------------|
+ *        |--- requested region --|   |------- extent ----|
+ *        |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ *          |------------- cluster # N-------------|
+ * |----- ex -----|                  |---- ex_right ----|
+ *                  |------ requested region ------|
+ *                  |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so they correspond to the portion of the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region.  Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks().
+ */
+static int get_implied_cluster_alloc(struct super_block *sb,
+                                    struct ext4_map_blocks *map,
+                                    struct ext4_extent *ex,
+                                    struct ext4_ext_path *path)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+       ext4_lblk_t ex_cluster_start, ex_cluster_end;
+       ext4_lblk_t rr_cluster_start, rr_cluster_end;
+       ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+       ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+       unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+       /* The extent passed in that we are trying to match */
+       ex_cluster_start = EXT4_B2C(sbi, ee_block);
+       ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+       /* The requested region passed into ext4_map_blocks() */
+       rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+       rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
+
+       if ((rr_cluster_start == ex_cluster_end) ||
+           (rr_cluster_start == ex_cluster_start)) {
+               if (rr_cluster_start == ex_cluster_end)
+                       ee_start += ee_len - 1;
+               map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
+                       c_offset;
+               map->m_len = min(map->m_len,
+                                (unsigned) sbi->s_cluster_ratio - c_offset);
+               /*
+                * Check for and handle this case:
+                *
+                *   |--------- cluster # N-------------|
+                *                     |------- extent ----|
+                *         |--- requested region ---|
+                *         |===========|
+                */
+
+               if (map->m_lblk < ee_block)
+                       map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+               /*
+                * Check for the case where there is already another allocated
+                * block to the right of 'ex' but before the end of the cluster.
+                *
+                *          |------------- cluster # N-------------|
+                * |----- ex -----|                  |---- ex_right ----|
+                *                  |------ requested region ------|
+                *                  |================|
+                */
+               if (map->m_lblk > ee_block) {
+                       ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+                       map->m_len = min(map->m_len, next - map->m_lblk);
+               }
+
+               trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
+               return 1;
+       }
+
+       trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
+       return 0;
+}
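
A reduced sketch of the cluster-reuse decision above: if the cluster containing the requested block is the first or last cluster of a nearby extent, the physical cluster already exists and the request is mapped into it rather than allocating a new one. The struct names and numbers are illustrative, and the third case's trimming against a following extent is omitted here.

#include <stdio.h>

#define CLUSTER_RATIO	8	/* blocks per cluster, power of two */
#define B2C(b)		((b) / CLUSTER_RATIO)

struct map { unsigned int lblk, len; unsigned long long pblk; };
struct ext { unsigned int lblk, len; unsigned long long pblk; };

static int implied_cluster(struct map *map, const struct ext *ex)
{
	unsigned int c_offset = map->lblk % CLUSTER_RATIO;
	unsigned int left = CLUSTER_RATIO - c_offset;
	unsigned long long ee_start = ex->pblk;

	if (B2C(map->lblk) != B2C(ex->lblk) &&
	    B2C(map->lblk) != B2C(ex->lblk + ex->len - 1))
		return 0;

	/* if we share the extent's last cluster, anchor on its last block */
	if (B2C(map->lblk) == B2C(ex->lblk + ex->len - 1))
		ee_start += ex->len - 1;

	/* reuse the existing physical cluster, keeping the in-cluster offset */
	map->pblk = (ee_start & ~(unsigned long long)(CLUSTER_RATIO - 1)) + c_offset;
	if (map->len > left)
		map->len = left;
	/* do not run into the extent itself if it starts after the request */
	if (map->lblk < ex->lblk && map->len > ex->lblk - map->lblk)
		map->len = ex->lblk - map->lblk;
	return 1;
}

int main(void)
{
	struct ext ex = { 16, 4, 1000 };	/* blocks 16..19 at 1000..1003 */
	struct map map = { 21, 10, 0 };		/* request in the same cluster */

	if (implied_cluster(&map, &ex))
		printf("mapped to pblk %llu, len %u\n", map.pblk, map.len);
	return 0;
}
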
+
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3311,15 +3704,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map, int flags)
 {
        struct ext4_ext_path *path = NULL;
-       struct ext4_extent newex, *ex;
+       struct ext4_extent newex, *ex, *ex2;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_fsblk_t newblock = 0;
-       int err = 0, depth, ret;
-       unsigned int allocated = 0;
+       int free_on_err = 0, err = 0, depth, ret;
+       unsigned int allocated = 0, offset = 0;
+       unsigned int allocated_clusters = 0;
        unsigned int punched_out = 0;
        unsigned int result = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
-       struct ext4_map_blocks punch_map;
+       ext4_lblk_t cluster_offset;
 
        ext_debug("blocks %u/%u requested for inode %lu\n",
                  map->m_lblk, map->m_len, inode->i_ino);
@@ -3329,6 +3724,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
                ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
                if (!newex.ee_start_lo && !newex.ee_start_hi) {
+                       if ((sbi->s_cluster_ratio > 1) &&
+                           ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+                               map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                /*
                                 * block isn't allocated yet and
@@ -3339,6 +3738,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        /* we should allocate requested block */
                } else {
                        /* block is already allocated */
+                       if (sbi->s_cluster_ratio > 1)
+                               map->m_flags |= EXT4_MAP_FROM_CLUSTER;
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
                                   + ext4_ext_pblock(&newex);
@@ -3384,8 +3785,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 * we split out initialized portions during a write.
                 */
                ee_len = ext4_ext_get_actual_len(ex);
+
+               trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
                /* if found extent covers block, simply return it */
                if (in_range(map->m_lblk, ee_block, ee_len)) {
+                       struct ext4_map_blocks punch_map;
+                       ext4_fsblk_t partial_cluster = 0;
+
                        newblock = map->m_lblk - ee_block + ee_start;
                        /* number of remaining blocks in the extent */
                        allocated = ee_len - (map->m_lblk - ee_block);
@@ -3469,7 +3876,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        ext4_ext_invalidate_cache(inode);
 
                        err = ext4_ext_rm_leaf(handle, inode, path,
-                               map->m_lblk, map->m_lblk + punched_out);
+                                              &partial_cluster, map->m_lblk,
+                                              map->m_lblk + punched_out);
 
                        if (!err && path->p_hdr->eh_entries == 0) {
                                /*
@@ -3492,6 +3900,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                }
        }
 
+       if ((sbi->s_cluster_ratio > 1) &&
+           ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+               map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
        /*
         * requested block isn't allocated yet;
         * we couldn't try to create block if create flag is zero
@@ -3504,9 +3916,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
                goto out2;
        }
+
        /*
         * Okay, we need to do block allocation.
         */
+       map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+       newex.ee_block = cpu_to_le32(map->m_lblk);
+       cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+
+       /*
+        * If we are doing bigalloc, check to see if the extent returned
+        * by ext4_ext_find_extent() implies a cluster we can use.
+        */
+       if (cluster_offset && ex &&
+           get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
+               ar.len = allocated = map->m_len;
+               newblock = map->m_pblk;
+               map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+               goto got_allocated_blocks;
+       }
 
        /* find neighbour allocated blocks */
        ar.lleft = map->m_lblk;
@@ -3514,10 +3942,21 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        if (err)
                goto out2;
        ar.lright = map->m_lblk;
-       err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+       ex2 = NULL;
+       err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
        if (err)
                goto out2;
 
+       /* Check if the extent after searching to the right implies a
+        * cluster we can use. */
+       if ((sbi->s_cluster_ratio > 1) && ex2 &&
+           get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+               ar.len = allocated = map->m_len;
+               newblock = map->m_pblk;
+               map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+               goto got_allocated_blocks;
+       }
+
        /*
         * See if request is beyond maximum number of blocks we can have in
         * a single extent. For an initialized extent this limit is
@@ -3532,9 +3971,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                map->m_len = EXT_UNINIT_MAX_LEN;
 
        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
-       newex.ee_block = cpu_to_le32(map->m_lblk);
        newex.ee_len = cpu_to_le16(map->m_len);
-       err = ext4_ext_check_overlap(inode, &newex, path);
+       err = ext4_ext_check_overlap(sbi, inode, &newex, path);
        if (err)
                allocated = ext4_ext_get_actual_len(&newex);
        else
@@ -3544,7 +3982,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        ar.inode = inode;
        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
        ar.logical = map->m_lblk;
-       ar.len = allocated;
+       /*
+        * We calculate the offset from the beginning of the cluster
+        * for the logical block number, since when we allocate a
+        * physical cluster, the physical block should start at the
+        * same offset from the beginning of the cluster.  This is
+        * needed so that future calls to get_implied_cluster_alloc()
+        * work correctly.
+        */
+       offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+       ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+       ar.goal -= offset;
+       ar.logical -= offset;
        if (S_ISREG(inode->i_mode))
                ar.flags = EXT4_MB_HINT_DATA;
        else
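
The comment above explains why the allocation request is pulled back to a cluster boundary. A minimal user-space sketch of that arithmetic, assuming a bigalloc cluster ratio of 4, with B2C()/C2B() standing in for EXT4_NUM_B2C()/EXT4_C2B():

#include <stdio.h>

/* stand-ins for EXT4_NUM_B2C()/EXT4_C2B() with an assumed cluster ratio of 4 */
#define CLUSTER_BITS    2
#define CLUSTER_RATIO   (1 << CLUSTER_BITS)
#define B2C(blks)       (((blks) + CLUSTER_RATIO - 1) >> CLUSTER_BITS)
#define C2B(clusters)   ((clusters) << CLUSTER_BITS)

int main(void)
{
        unsigned int lblk = 10, allocated = 3;            /* 3 blocks at logical block 10 */
        unsigned int offset = lblk & (CLUSTER_RATIO - 1); /* offset inside the cluster: 2 */
        unsigned int ask = B2C(offset + allocated);       /* clusters to request: 2       */
        unsigned int len = C2B(ask) - offset;             /* usable blocks past goal: 6   */

        if (len > allocated)                              /* clamp to what was asked for  */
                len = allocated;
        printf("offset=%u clusters=%u len=%u\n", offset, ask, len);
        return 0;
}
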
@@ -3557,9 +4006,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                goto out2;
        ext_debug("allocate new block: goal %llu, found %llu/%u\n",
                  ar.goal, newblock, allocated);
+       free_on_err = 1;
+       allocated_clusters = ar.len;
+       ar.len = EXT4_C2B(sbi, ar.len) - offset;
+       if (ar.len > allocated)
+               ar.len = allocated;
 
+got_allocated_blocks:
        /* try to insert new extent into found leaf and return */
-       ext4_ext_store_pblock(&newex, newblock);
+       ext4_ext_store_pblock(&newex, newblock + offset);
        newex.ee_len = cpu_to_le16(ar.len);
        /* Mark uninitialized */
        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
@@ -3572,10 +4027,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 * that we need to perform conversion when IO is done.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                       if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
-                               io->flag = EXT4_IO_END_UNWRITTEN;
-                               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-                       } else
+                       if (io)
+                               ext4_set_io_unwritten_flag(inode, io);
+                       else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
                }
@@ -3583,11 +4037,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
 
-       err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
+       err = 0;
+       if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+               err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                        path, ar.len);
        if (!err)
                err = ext4_ext_insert_extent(handle, inode, path,
                                             &newex, flags);
-       if (err) {
+       if (err && free_on_err) {
                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
                /* free data blocks we just allocated */
@@ -3610,8 +4067,82 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         * Update reserved blocks/metadata blocks after successful
         * block allocation which had been deferred till now.
         */
-       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-               ext4_da_update_reserve_space(inode, allocated, 1);
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+               unsigned int reserved_clusters;
+               /*
+                * Check how many clusters we had reserved for this allocated range
+                */
+               reserved_clusters = get_reserved_cluster_alloc(inode,
+                                               map->m_lblk, allocated);
+               if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+                       if (reserved_clusters) {
+                               /*
+                                * We have clusters reserved for this range.
+                                * But since we are not doing actual allocation
+                                * and are simply using blocks from previously
+                                * and are simply using blocks from a previously
+                                * allocated cluster, we should release the
+                                */
+                               ext4_da_update_reserve_space(inode,
+                                               reserved_clusters, 0);
+                       }
+               } else {
+                       BUG_ON(allocated_clusters < reserved_clusters);
+                       /* We will claim quota for all newly allocated blocks.*/
+                       ext4_da_update_reserve_space(inode, allocated_clusters,
+                                                       1);
+                       if (reserved_clusters < allocated_clusters) {
+                               struct ext4_inode_info *ei = EXT4_I(inode);
+                               int reservation = allocated_clusters -
+                                                 reserved_clusters;
+                               /*
+                                * It seems we claimed a few clusters outside of
+                                * the range of this allocation. We should give
+                                * it back to the reservation pool. This can
+                                * happen in the following case:
+                                *
+                                * * Suppose s_cluster_ratio is 4 (i.e., each
+                                *   cluster has 4 blocks). Thus, the clusters
+                                *   are [0-3],[4-7],[8-11]...
+                                * * First comes delayed allocation write for
+                                *   logical blocks 10 & 11. Since there were no
+                                *   previous delayed allocated blocks in the
+                                *   range [8-11], we would reserve 1 cluster
+                                *   for this write.
+                                * * Next comes write for logical blocks 3 to 8.
+                                *   In this case, we will reserve 2 clusters
+                                *   (for [0-3] and [4-7]; not for [8-11], as
+                                *   that range already has delayed allocated
+                                *   blocks).  Thus the total reserved clusters
+                                *   now becomes 3.
+                                * * Now, during the delayed allocation writeout
+                                *   time, we will first write blocks [3-8] and
+                                *   allocate 3 clusters for writing these
+                                *   blocks. Also, we would claim all these
+                                *   three clusters above.
+                                * * Now when we come here to writeout the
+                                *   blocks [10-11], we would expect to claim
+                                *   the reservation of 1 cluster we had made
+                                *   (and we would claim it since there are no
+                                *   more delayed allocated blocks in the range
+                                *   [8-11]).  But our reserved cluster count had
+                                *   already gone to 0.
+                                *
+                                *   Thus, at step 4 above, when we determine
+                                *   that there are still some unwritten delayed
+                                *   allocated blocks outside of our current
+                                *   block range, we should increment the
+                                *   reserved clusters count so that when the
+                                *   remaining blocks finally get written, we
+                                *   can claim them.
+                                */
+                               dquot_reserve_block(inode,
+                                               EXT4_C2B(sbi, reservation));
+                               spin_lock(&ei->i_block_reservation_lock);
+                               ei->i_reserved_data_blocks += reservation;
+                               spin_unlock(&ei->i_block_reservation_lock);
+                       }
+               }
+       }
 
        /*
         * Cache the extent and update transaction to commit on fdatasync only
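
The scenario in the comment above can be played through with a toy counter. This only models the book-keeping (cluster ratio 4, the [0-3]/[4-7]/[8-11] layout from the comment); the real code uses the per-inode reservation fields and quota calls shown in the hunk:

#include <stdio.h>

int main(void)
{
        int reserved = 0;                 /* per-inode reserved delalloc clusters      */

        reserved += 1;                    /* delayed write to blocks 10-11: [8-11]     */
        reserved += 2;                    /* delayed write to blocks 3-8: [0-3], [4-7] */

        /* Writeback of blocks 3-8 allocates three clusters, but only two were
         * reserved for that range; the third came out of the reservation made
         * for [8-11], so the difference is returned to the pool.              */
        int allocated_clusters = 3, reserved_for_range = 2;
        reserved -= allocated_clusters;
        reserved += allocated_clusters - reserved_for_range;

        /* Writeback of blocks 10-11 can now still claim its one cluster.      */
        printf("reserved clusters left = %d\n", reserved);   /* prints 1 */
        return 0;
}
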
@@ -3634,12 +4165,12 @@ out2:
                ext4_ext_drop_refs(path);
                kfree(path);
        }
-       trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-               newblock, map->m_len, err ? err : allocated);
-
        result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
                        punched_out : allocated;
 
+       trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+               newblock, map->m_len, err ? err : result);
+
        return err ? err : result;
 }
 
@@ -3649,6 +4180,7 @@ void ext4_ext_truncate(struct inode *inode)
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t last_block;
        handle_t *handle;
+       loff_t page_len;
        int err = 0;
 
        /*
@@ -3665,8 +4197,16 @@ void ext4_ext_truncate(struct inode *inode)
        if (IS_ERR(handle))
                return;
 
-       if (inode->i_size & (sb->s_blocksize - 1))
-               ext4_block_truncate_page(handle, mapping, inode->i_size);
+       if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+               page_len = PAGE_CACHE_SIZE -
+                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+               err = ext4_discard_partial_page_buffers(handle,
+                       mapping, inode->i_size, page_len, 0);
+
+               if (err)
+                       goto out_stop;
+       }
 
        if (ext4_orphan_add(handle, inode))
                goto out_stop;
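
Truncate now discards the whole partial page past i_size instead of only the partial block. A small sketch of the length computation, assuming 4096-byte pages in place of PAGE_CACHE_SIZE:

#include <stdio.h>

#define PAGE_SIZE_BYTES 4096ULL          /* assumed page size */

int main(void)
{
        unsigned long long i_size = 10000;   /* example: file size not page aligned */

        if (i_size % PAGE_SIZE_BYTES != 0) {
                /* bytes from i_size to the end of its page; this is the range the
                 * truncate path hands to ext4_discard_partial_page_buffers() */
                unsigned long long page_len =
                        PAGE_SIZE_BYTES - (i_size & (PAGE_SIZE_BYTES - 1));
                printf("discard %llu bytes starting at %llu\n", page_len, i_size);
        }
        return 0;
}
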
@@ -3760,6 +4300,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
+       int flags;
        struct ext4_map_blocks map;
        unsigned int credits, blkbits = inode->i_blkbits;
 
@@ -3796,6 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
                trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
                return ret;
        }
+       flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+       /*
+        * Don't normalize the request if it can fit in one extent so
+        * that it doesn't get unnecessarily split into multiple
+        * extents.
+        */
+       if (len <= EXT_UNINIT_MAX_LEN << blkbits)
+               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 retry:
        while (ret >= 0 && ret < max_blocks) {
                map.m_lblk = map.m_lblk + ret;
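
The NO_NORMALIZE hint is only set when the whole request fits in a single uninitialized extent. A sketch of that size test, assuming 4096-byte blocks (blkbits = 12) and the 32767-block EXT_UNINIT_MAX_LEN limit:

#include <stdio.h>

#define EXT_UNINIT_MAX_LEN 32767          /* max blocks in one uninitialized extent */

int main(void)
{
        unsigned int blkbits = 12;                    /* assumed 4096-byte blocks   */
        long long len = 64LL << 20;                   /* a 64 MiB fallocate request */

        if (len <= ((long long)EXT_UNINIT_MAX_LEN << blkbits))
                printf("fits in one extent: skip normalization\n");
        else
                printf("needs more than one extent: allow normalization\n");
        return 0;
}
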
@@ -3805,9 +4356,7 @@ retry:
                        ret = PTR_ERR(handle);
                        break;
                }
-               ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
-                                     EXT4_GET_BLOCKS_NO_NORMALIZE);
+               ret = ext4_map_blocks(handle, inode, &map, flags);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
@@ -4102,7 +4651,6 @@ found_delayed_extent:
                return EXT_BREAK;
        return EXT_CONTINUE;
 }
-
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
@@ -4162,17 +4710,28 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        struct address_space *mapping = inode->i_mapping;
        struct ext4_map_blocks map;
        handle_t *handle;
-       loff_t first_block_offset, last_block_offset, block_len;
-       loff_t first_page, last_page, first_page_offset, last_page_offset;
+       loff_t first_page, last_page, page_len;
+       loff_t first_page_offset, last_page_offset;
        int ret, credits, blocks_released, err = 0;
 
+       /* No need to punch hole beyond i_size */
+       if (offset >= inode->i_size)
+               return 0;
+
+       /*
+        * If the hole extends beyond i_size, set the hole
+        * to end after the page that contains i_size
+        */
+       if (offset + length > inode->i_size) {
+               length = inode->i_size +
+                  PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+                  offset;
+       }
+
        first_block = (offset + sb->s_blocksize - 1) >>
                EXT4_BLOCK_SIZE_BITS(sb);
        last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
 
-       first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
-       last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
-
        first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        last_page = (offset + length) >> PAGE_CACHE_SHIFT;
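
A worked example of the clamp above, assuming 4096-byte pages: a hole that runs past i_size is shortened so that it ends at the page boundary after i_size.

#include <stdio.h>

#define PAGE_SIZE_BYTES 4096ULL          /* assumed page size */

int main(void)
{
        unsigned long long i_size = 10000;
        unsigned long long offset = 8192, length = 1ULL << 20;   /* hole far past EOF */

        if (offset + length > i_size)
                length = i_size + PAGE_SIZE_BYTES -
                         (i_size & (PAGE_SIZE_BYTES - 1)) - offset;

        /* the hole now ends at 12288, the end of the page containing i_size */
        printf("punch [%llu, %llu)\n", offset, offset + length);
        return 0;
}
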
 
@@ -4185,11 +4744,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         */
        if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                err = filemap_write_and_wait_range(mapping,
-                       first_page_offset == 0 ? 0 : first_page_offset-1,
-                       last_page_offset);
+                       offset, offset + length - 1);
 
-                       if (err)
-                               return err;
+               if (err)
+                       return err;
        }
 
        /* Now release the pages */
@@ -4211,24 +4769,64 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
                goto out;
 
        /*
-        * Now we need to zero out the un block aligned data.
-        * If the file is smaller than a block, just
-        * zero out the middle
+        * Now we need to zero out the non-page-aligned data in the
+        * pages at the start and tail of the hole, and unmap the buffer
+        * heads for the block aligned regions of the page that were
+        * completely zeroed.
         */
-       if (first_block > last_block)
-               ext4_block_zero_page_range(handle, mapping, offset, length);
-       else {
-               /* zero out the head of the hole before the first block */
-               block_len  = first_block_offset - offset;
-               if (block_len > 0)
-                       ext4_block_zero_page_range(handle, mapping,
-                                                  offset, block_len);
-
-               /* zero out the tail of the hole after the last block */
-               block_len = offset + length - last_block_offset;
-               if (block_len > 0) {
-                       ext4_block_zero_page_range(handle, mapping,
-                                       last_block_offset, block_len);
+       if (first_page > last_page) {
+               /*
+                * If the file space being truncated is contained within a page,
+                * just zero out and unmap the middle of that page.
+                */
+               err = ext4_discard_partial_page_buffers(handle,
+                       mapping, offset, length, 0);
+
+               if (err)
+                       goto out;
+       } else {
+               /*
+                * zero out and unmap the partial page that contains
+                * the start of the hole
+                */
+               page_len  = first_page_offset - offset;
+               if (page_len > 0) {
+                       err = ext4_discard_partial_page_buffers(handle, mapping,
+                                                  offset, page_len, 0);
+                       if (err)
+                               goto out;
+               }
+
+               /*
+                * zero out and unmap the partial page that contains
+                * the end of the hole
+                */
+               page_len = offset + length - last_page_offset;
+               if (page_len > 0) {
+                       err = ext4_discard_partial_page_buffers(handle, mapping,
+                                       last_page_offset, page_len, 0);
+                       if (err)
+                               goto out;
+               }
+       }
+
+
+       /*
+        * If i_size is contained in the last page, we need to
+        * unmap and zero the partial page after i_size
+        */
+       if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+          inode->i_size % PAGE_CACHE_SIZE != 0) {
+
+               page_len = PAGE_CACHE_SIZE -
+                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+               if (page_len > 0) {
+                       err = ext4_discard_partial_page_buffers(handle,
+                         mapping, inode->i_size, page_len, 0);
+
+                       if (err)
+                               goto out;
                }
        }
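
A worked example of the head/tail page maths above, assuming 4096-byte pages and that first_page_offset/last_page_offset are the byte offsets of the first fully-covered page and of the page holding the end of the hole:

#include <stdio.h>

#define PAGE_SIZE_BYTES 4096ULL          /* assumed page size */

int main(void)
{
        unsigned long long offset = 1000, length = 10000;        /* hole = [1000, 11000) */

        unsigned long long first_page = (offset + PAGE_SIZE_BYTES - 1) / PAGE_SIZE_BYTES;
        unsigned long long last_page  = (offset + length) / PAGE_SIZE_BYTES;
        unsigned long long first_page_offset = first_page * PAGE_SIZE_BYTES;  /* 4096 */
        unsigned long long last_page_offset  = last_page * PAGE_SIZE_BYTES;   /* 8192 */

        /* partial page at the start of the hole: discard [1000, 4096) */
        unsigned long long head_len = first_page_offset - offset;
        /* partial page at the end of the hole: discard [8192, 11000) */
        unsigned long long tail_len = offset + length - last_page_offset;

        printf("head %llu bytes at %llu, tail %llu bytes at %llu\n",
               head_len, offset, tail_len, last_page_offset);
        return 0;
}
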
 
index b9548f4..cb70f18 100644 (file)
@@ -181,8 +181,8 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                path.dentry = mnt->mnt_root;
                cp = d_path(&path, buf, sizeof(buf));
                if (!IS_ERR(cp)) {
-                       memcpy(sbi->s_es->s_last_mounted, cp,
-                              sizeof(sbi->s_es->s_last_mounted));
+                       strlcpy(sbi->s_es->s_last_mounted, cp,
+                               sizeof(sbi->s_es->s_last_mounted));
                        ext4_mark_super_dirty(sb);
                }
        }
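
The memcpy() here copied sizeof(s_last_mounted) bytes regardless of the length of the d_path() result, reading past the end of the source string; strlcpy() stops at the NUL and always terminates the destination. A user-space sketch with a local strlcpy-style helper (bounded_copy is a made-up name, since not every libc ships strlcpy):

#include <stdio.h>
#include <string.h>

/* minimal strlcpy-style helper: copy at most size-1 bytes, always NUL-terminate */
static size_t bounded_copy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;
                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        char last_mounted[64];

        bounded_copy(last_mounted, "/mnt/test", sizeof(last_mounted));
        printf("%s\n", last_mounted);
        return 0;
}
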
index 036f78f..00a2cb7 100644 (file)
@@ -75,7 +75,7 @@ static void dump_completed_IO(struct inode * inode)
  * to written.
  * The function returns the number of pending IOs on success.
  */
-extern int ext4_flush_completed_IO(struct inode *inode)
+int ext4_flush_completed_IO(struct inode *inode)
 {
        ext4_io_end_t *io;
        struct ext4_inode_info *ei = EXT4_I(inode);
@@ -83,14 +83,12 @@ extern int ext4_flush_completed_IO(struct inode *inode)
        int ret = 0;
        int ret2 = 0;
 
-       if (list_empty(&ei->i_completed_io_list))
-               return ret;
-
        dump_completed_IO(inode);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        while (!list_empty(&ei->i_completed_io_list)){
                io = list_entry(ei->i_completed_io_list.next,
                                ext4_io_end_t, list);
+               list_del_init(&io->list);
                /*
                 * Calling ext4_end_io_nolock() to convert completed
                 * IO to written.
@@ -107,11 +105,9 @@ extern int ext4_flush_completed_IO(struct inode *inode)
                 */
                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
                ret = ext4_end_io_nolock(io);
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
                if (ret < 0)
                        ret2 = ret;
-               else
-                       list_del_init(&io->list);
+               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        }
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
        return (ret2 < 0) ? ret2 : 0;
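
The reworked loop detaches each io_end from i_completed_io_list while the spinlock is still held and only then drops the lock for the (possibly blocking) conversion, so a concurrent flusher can never pick up the same entry twice. A single-threaded sketch of that drain pattern, with the lock points shown only as comments:

#include <stdio.h>
#include <stdlib.h>

struct io_end {
        struct io_end *next;
        int id;
};

int main(void)
{
        struct io_end *head = NULL;

        /* build a small "completed IO" list: 3 -> 2 -> 1 */
        for (int i = 1; i <= 3; i++) {
                struct io_end *io = malloc(sizeof(*io));
                io->id = i;
                io->next = head;
                head = io;
        }

        while (head) {                  /* lock held at the top of each pass       */
                struct io_end *io = head;
                head = io->next;        /* detach the entry while still "locked"   */
                io->next = NULL;
                                        /* lock dropped here; conversion may sleep */
                printf("convert io_end %d to written\n", io->id);
                free(io);               /* lock re-acquired before the next pass   */
        }
        return 0;
}
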
index 9c63f27..612bec2 100644 (file)
@@ -78,7 +78,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
                ext4_error(sb, "Checksum bad for group %u", block_group);
-               ext4_free_blks_set(sb, gdp, 0);
+               ext4_free_group_clusters_set(sb, gdp, 0);
                ext4_free_inodes_set(sb, gdp, 0);
                ext4_itable_unused_set(sb, gdp, 0);
                memset(bh->b_data, 0xff, sb->s_blocksize);
@@ -293,121 +293,9 @@ error_return:
        ext4_std_error(sb, fatal);
 }
 
-/*
- * There are two policies for allocating an inode.  If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent,
-                               ext4_group_t *best_group)
-{
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       unsigned int freei, avefreei;
-       struct ext4_group_desc *desc, *best_desc = NULL;
-       ext4_group_t group;
-       int ret = -1;
-
-       freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
-       avefreei = freei / ngroups;
-
-       for (group = 0; group < ngroups; group++) {
-               desc = ext4_get_group_desc(sb, group, NULL);
-               if (!desc || !ext4_free_inodes_count(sb, desc))
-                       continue;
-               if (ext4_free_inodes_count(sb, desc) < avefreei)
-                       continue;
-               if (!best_desc ||
-                   (ext4_free_blks_count(sb, desc) >
-                    ext4_free_blks_count(sb, best_desc))) {
-                       *best_group = group;
-                       best_desc = desc;
-                       ret = 0;
-               }
-       }
-       return ret;
-}
-
-#define free_block_ratio 10
-
-static int find_group_flex(struct super_block *sb, struct inode *parent,
-                          ext4_group_t *best_group)
-{
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_group_desc *desc;
-       struct flex_groups *flex_group = sbi->s_flex_groups;
-       ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-       ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       int flex_size = ext4_flex_bg_size(sbi);
-       ext4_group_t best_flex = parent_fbg_group;
-       int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
-       int flexbg_free_blocks;
-       int flex_freeb_ratio;
-       ext4_group_t n_fbg_groups;
-       ext4_group_t i;
-
-       n_fbg_groups = (ngroups + flex_size - 1) >>
-               sbi->s_log_groups_per_flex;
-
-find_close_to_parent:
-       flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
-       flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-       if (atomic_read(&flex_group[best_flex].free_inodes) &&
-           flex_freeb_ratio > free_block_ratio)
-               goto found_flexbg;
-
-       if (best_flex && best_flex == parent_fbg_group) {
-               best_flex--;
-               goto find_close_to_parent;
-       }
-
-       for (i = 0; i < n_fbg_groups; i++) {
-               if (i == parent_fbg_group || i == parent_fbg_group - 1)
-                       continue;
-
-               flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
-               flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-
-               if (flex_freeb_ratio > free_block_ratio &&
-                   (atomic_read(&flex_group[i].free_inodes))) {
-                       best_flex = i;
-                       goto found_flexbg;
-               }
-
-               if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
-                   ((atomic_read(&flex_group[i].free_blocks) >
-                     atomic_read(&flex_group[best_flex].free_blocks)) &&
-                    atomic_read(&flex_group[i].free_inodes)))
-                       best_flex = i;
-       }
-
-       if (!atomic_read(&flex_group[best_flex].free_inodes) ||
-           !atomic_read(&flex_group[best_flex].free_blocks))
-               return -1;
-
-found_flexbg:
-       for (i = best_flex * flex_size; i < ngroups &&
-                    i < (best_flex + 1) * flex_size; i++) {
-               desc = ext4_get_group_desc(sb, i, NULL);
-               if (ext4_free_inodes_count(sb, desc)) {
-                       *best_group = i;
-                       goto out;
-               }
-       }
-
-       return -1;
-out:
-       return 0;
-}
-
 struct orlov_stats {
        __u32 free_inodes;
-       __u32 free_blocks;
+       __u32 free_clusters;
        __u32 used_dirs;
 };
 
@@ -424,7 +312,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 
        if (flex_size > 1) {
                stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-               stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+               stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
                stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
                return;
        }
@@ -432,11 +320,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
        desc = ext4_get_group_desc(sb, g, NULL);
        if (desc) {
                stats->free_inodes = ext4_free_inodes_count(sb, desc);
-               stats->free_blocks = ext4_free_blks_count(sb, desc);
+               stats->free_clusters = ext4_free_group_clusters(sb, desc);
                stats->used_dirs = ext4_used_dirs_count(sb, desc);
        } else {
                stats->free_inodes = 0;
-               stats->free_blocks = 0;
+               stats->free_clusters = 0;
                stats->used_dirs = 0;
        }
 }
@@ -471,10 +359,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        ext4_group_t real_ngroups = ext4_get_groups_count(sb);
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
-       ext4_fsblk_t freeb, avefreeb;
+       ext4_fsblk_t freeb, avefreec;
        unsigned int ndirs;
        int max_dirs, min_inodes;
-       ext4_grpblk_t min_blocks;
+       ext4_grpblk_t min_clusters;
        ext4_group_t i, grp, g, ngroups;
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
@@ -490,9 +378,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
-       freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-       avefreeb = freeb;
-       do_div(avefreeb, ngroups);
+       freeb = EXT4_C2B(sbi,
+               percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+       avefreec = freeb;
+       do_div(avefreec, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
        if (S_ISDIR(mode) &&
@@ -518,7 +407,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                                continue;
                        if (stats.free_inodes < avefreei)
                                continue;
-                       if (stats.free_blocks < avefreeb)
+                       if (stats.free_clusters < avefreec)
                                continue;
                        grp = g;
                        ret = 0;
@@ -556,7 +445,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        min_inodes = avefreei - inodes_per_group*flex_size / 4;
        if (min_inodes < 1)
                min_inodes = 1;
-       min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+       min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
 
        /*
         * Start looking in the flex group where we last allocated an
@@ -575,7 +464,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                        continue;
                if (stats.free_inodes < min_inodes)
                        continue;
-               if (stats.free_blocks < min_blocks)
+               if (stats.free_clusters < min_clusters)
                        continue;
                goto found_flex_bg;
        }
@@ -659,7 +548,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
        *group = parent_group;
        desc = ext4_get_group_desc(sb, *group, NULL);
        if (desc && ext4_free_inodes_count(sb, desc) &&
-                       ext4_free_blks_count(sb, desc))
+           ext4_free_group_clusters(sb, desc))
                return 0;
 
        /*
@@ -683,7 +572,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                        *group -= ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && ext4_free_inodes_count(sb, desc) &&
-                               ext4_free_blks_count(sb, desc))
+                   ext4_free_group_clusters(sb, desc))
                        return 0;
        }
 
@@ -802,7 +691,7 @@ err_ret:
  * group to find a free inode.
  */
 struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
-                            const struct qstr *qstr, __u32 goal)
+                            const struct qstr *qstr, __u32 goal, uid_t *owner)
 {
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
@@ -816,8 +705,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
        int ret2, err = 0;
        struct inode *ret;
        ext4_group_t i;
-       int free = 0;
-       static int once = 1;
        ext4_group_t flex_group;
 
        /* Cannot create files in a deleted directory */
@@ -843,26 +730,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
                goto got_group;
        }
 
-       if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
-               ret2 = find_group_flex(sb, dir, &group);
-               if (ret2 == -1) {
-                       ret2 = find_group_other(sb, dir, &group, mode);
-                       if (ret2 == 0 && once) {
-                               once = 0;
-                               printk(KERN_NOTICE "ext4: find_group_flex "
-                                      "failed, fallback succeeded dir %lu\n",
-                                      dir->i_ino);
-                       }
-               }
-               goto got_group;
-       }
-
-       if (S_ISDIR(mode)) {
-               if (test_opt(sb, OLDALLOC))
-                       ret2 = find_group_dir(sb, dir, &group);
-               else
-                       ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
-       } else
+       if (S_ISDIR(mode))
+               ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+       else
                ret2 = find_group_other(sb, dir, &group, mode);
 
 got_group:
@@ -950,26 +820,21 @@ got:
                        goto fail;
                }
 
-               free = 0;
-               ext4_lock_group(sb, group);
+               BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+               err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+               brelse(block_bitmap_bh);
+
                /* recheck and clear flag under lock if we still need to */
+               ext4_lock_group(sb, group);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                       free = ext4_free_blocks_after_init(sb, group, gdp);
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-                       ext4_free_blks_set(sb, gdp, free);
+                       ext4_free_group_clusters_set(sb, gdp,
+                               ext4_free_clusters_after_init(sb, group, gdp));
                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
                                                                gdp);
                }
                ext4_unlock_group(sb, group);
 
-               /* Don't need to dirty bitmap block if we didn't change it */
-               if (free) {
-                       BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
-                       err = ext4_handle_dirty_metadata(handle,
-                                                       NULL, block_bitmap_bh);
-               }
-
-               brelse(block_bitmap_bh);
                if (err)
                        goto fail;
        }
@@ -987,8 +852,11 @@ got:
                flex_group = ext4_flex_group(sbi, group);
                atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
        }
-
-       if (test_opt(sb, GRPID)) {
+       if (owner) {
+               inode->i_mode = mode;
+               inode->i_uid = owner[0];
+               inode->i_gid = owner[1];
+       } else if (test_opt(sb, GRPID)) {
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = dir->i_gid;
@@ -1005,11 +873,7 @@ got:
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;
 
-       /*
-        * Don't inherit extent flag from directory, amongst others. We set
-        * extent flag on newly created directory and file only if -o extent
-        * mount option is specified
-        */
+       /* Don't inherit extent flag from directory, amongst others. */
        ei->i_flags =
                ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
        ei->i_file_acl = 0;
@@ -1235,7 +1099,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
  * inode allocation from the current group, so we take alloc_sem lock, to
  * block ext4_claim_inode until we are finished.
  */
-extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
                                 int barrier)
 {
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
index 0962642..3cfc73f 100644 (file)
@@ -699,6 +699,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
        /*
         * Okay, we need to do block allocation.
        */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+               EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
+                                "non-extent mapped inodes with bigalloc");
+               return -ENOSPC;
+       }
+
        goal = ext4_find_goal(inode, map->m_lblk, partial);
 
        /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1343,7 +1350,9 @@ void ext4_ind_truncate(struct inode *inode)
        __le32 nr = 0;
        int n = 0;
        ext4_lblk_t last_block, max_block;
+       loff_t page_len;
        unsigned blocksize = inode->i_sb->s_blocksize;
+       int err;
 
        handle = start_transaction(inode);
        if (IS_ERR(handle))
@@ -1354,9 +1363,16 @@ void ext4_ind_truncate(struct inode *inode)
        max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-       if (inode->i_size & (blocksize - 1))
-               if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+       if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+               page_len = PAGE_CACHE_SIZE -
+                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+               err = ext4_discard_partial_page_buffers(handle,
+                       mapping, inode->i_size, page_len, 0);
+
+               if (err)
                        goto out_stop;
+       }
 
        if (last_block != max_block) {
                n = ext4_block_to_path(inode, last_block, offsets, NULL);
index 0defe0b..f2419a1 100644 (file)
@@ -42,7 +42,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "ext4_extents.h"
 #include "truncate.h"
 
 #include <trace/events/ext4.h>
@@ -268,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
        struct ext4_inode_info *ei = EXT4_I(inode);
 
        spin_lock(&ei->i_block_reservation_lock);
-       trace_ext4_da_update_reserve_space(inode, used);
+       trace_ext4_da_update_reserve_space(inode, used, quota_claim);
        if (unlikely(used > ei->i_reserved_data_blocks)) {
                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks\n",
@@ -281,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
        /* Update per-inode reservations */
        ei->i_reserved_data_blocks -= used;
        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
-       percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+       percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                           used + ei->i_allocated_meta_blocks);
        ei->i_allocated_meta_blocks = 0;
 
@@ -291,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
                 * only when we have written all of the delayed
                 * allocation blocks.
                 */
-               percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+               percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                   ei->i_reserved_meta_blocks);
                ei->i_reserved_meta_blocks = 0;
                ei->i_da_metadata_calc_len = 0;
@@ -300,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode,
 
        /* Update quota subsystem for data blocks */
        if (quota_claim)
-               dquot_claim_block(inode, used);
+               dquot_claim_block(inode, EXT4_C2B(sbi, used));
        else {
                /*
                 * We did fallocate with an offset that is already delayed
                 * allocated. So on delayed allocated writeback we should
                 * not re-claim the quota for fallocated blocks.
                 */
-               dquot_release_reservation_block(inode, used);
+               dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
        }
 
        /*
@@ -398,6 +397,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
        return num;
 }
 
+/*
+ * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
+ */
+static void set_buffers_da_mapped(struct inode *inode,
+                                  struct ext4_map_blocks *map)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct pagevec pvec;
+       int i, nr_pages;
+       pgoff_t index, end;
+
+       index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       end = (map->m_lblk + map->m_len - 1) >>
+               (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+       pagevec_init(&pvec, 0);
+       while (index <= end) {
+               nr_pages = pagevec_lookup(&pvec, mapping, index,
+                                         min(end - index + 1,
+                                             (pgoff_t)PAGEVEC_SIZE));
+               if (nr_pages == 0)
+                       break;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       struct buffer_head *bh, *head;
+
+                       if (unlikely(page->mapping != mapping) ||
+                           !PageDirty(page))
+                               break;
+
+                       if (page_has_buffers(page)) {
+                               bh = head = page_buffers(page);
+                               do {
+                                       set_buffer_da_mapped(bh);
+                                       bh = bh->b_this_page;
+                               } while (bh != head);
+                       }
+                       index++;
+               }
+               pagevec_release(&pvec);
+       }
+}
+
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -416,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
  * the buffer head is mapped.
  *
  * It returns 0 if plain look up failed (blocks have not been allocated), in
- * that casem, buffer head is unmapped
+ * that case, buffer head is unmapped
  *
  * It returns the error in case of allocation failure.
  */
@@ -435,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
         */
        down_read((&EXT4_I(inode)->i_data_sem));
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-               retval = ext4_ext_map_blocks(handle, inode, map, 0);
+               retval = ext4_ext_map_blocks(handle, inode, map, flags &
+                                            EXT4_GET_BLOCKS_KEEP_SIZE);
        } else {
-               retval = ext4_ind_map_blocks(handle, inode, map, 0);
+               retval = ext4_ind_map_blocks(handle, inode, map, flags &
+                                            EXT4_GET_BLOCKS_KEEP_SIZE);
        }
        up_read((&EXT4_I(inode)->i_data_sem));
 
@@ -455,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
         * Returns if the blocks have already allocated
         *
         * Note that if blocks have been preallocated
-        * ext4_ext_get_block() returns th create = 0
+        * ext4_ext_get_block() returns the create = 0
         * with buffer head unmapped.
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
@@ -517,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
-       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
+               /* If we have successfully mapped the delayed allocated blocks,
+                * set the BH_Da_Mapped bit on them.  It's important to do this
+                * under the protection of i_data_sem.
+                */
+               if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+                       set_buffers_da_mapped(inode, map);
+       }
+
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                int ret = check_block_validity(inode, map);
@@ -909,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file,
                        ext4_orphan_add(handle, inode);
                if (ret2 < 0)
                        ret = ret2;
+       } else {
+               unlock_page(page);
+               page_cache_release(page);
        }
+
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
@@ -1037,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
- * Reserve a single block located at lblock
+ * Reserve a single cluster located at lblock
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long md_needed;
+       unsigned int md_needed;
        int ret;
 
        /*
@@ -1054,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
         */
 repeat:
        spin_lock(&ei->i_block_reservation_lock);
-       md_needed = ext4_calc_metadata_amount(inode, lblock);
+       md_needed = EXT4_NUM_B2C(sbi,
+                                ext4_calc_metadata_amount(inode, lblock));
        trace_ext4_da_reserve_space(inode, md_needed);
        spin_unlock(&ei->i_block_reservation_lock);
 
@@ -1063,15 +1120,15 @@ repeat:
         * us from metadata over-estimation, though we may go over by
         * a small amount in the end.  Here we just reserve for data.
         */
-       ret = dquot_reserve_block(inode, 1);
+       ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
        if (ret)
                return ret;
        /*
         * We do still charge estimated metadata to the sb though;
         * we cannot afford to run out of free blocks.
         */
-       if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
-               dquot_release_reservation_block(inode, 1);
+       if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+               dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                        yield();
                        goto repeat;
@@ -1118,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
                 * We can release all of the reserved metadata blocks
                 * only when we have written all of the delayed
                 * allocation blocks.
+                * Note that in case of bigalloc, i_reserved_meta_blocks,
+                * i_reserved_data_blocks, etc. refer to number of clusters.
                 */
-               percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+               percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                   ei->i_reserved_meta_blocks);
                ei->i_reserved_meta_blocks = 0;
                ei->i_da_metadata_calc_len = 0;
        }
 
        /* update fs dirty data blocks counter */
-       percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
+       percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
 
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-       dquot_release_reservation_block(inode, to_free);
+       dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -1139,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page,
        int to_release = 0;
        struct buffer_head *head, *bh;
        unsigned int curr_off = 0;
+       struct inode *inode = page->mapping->host;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int num_clusters;
 
        head = page_buffers(page);
        bh = head;
@@ -1148,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page,
                if ((offset <= curr_off) && (buffer_delay(bh))) {
                        to_release++;
                        clear_buffer_delay(bh);
+                       clear_buffer_da_mapped(bh);
                }
                curr_off = next_off;
        } while ((bh = bh->b_this_page) != head);
-       ext4_da_release_space(page->mapping->host, to_release);
+
+       /* If we have released all the blocks belonging to a cluster, then we
+        * need to release the reserved space for that cluster. */
+       num_clusters = EXT4_NUM_B2C(sbi, to_release);
+       while (num_clusters > 0) {
+               ext4_fsblk_t lblk;
+               lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+                       ((num_clusters - 1) << sbi->s_cluster_bits);
+               if (sbi->s_cluster_ratio == 1 ||
+                   !ext4_find_delalloc_cluster(inode, lblk, 1))
+                       ext4_da_release_space(inode, 1);
+
+               num_clusters--;
+       }
 }
 
 /*
@@ -1253,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
                                        }
+                                       if (buffer_da_mapped(bh))
+                                               clear_buffer_da_mapped(bh);
                                        if (buffer_unwritten(bh) ||
                                            buffer_mapped(bh))
                                                BUG_ON(bh->b_blocknr != pblock);
@@ -1346,12 +1424,15 @@ static void ext4_print_free_blocks(struct inode *inode)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        printk(KERN_CRIT "Total free blocks count %lld\n",
-              ext4_count_free_blocks(inode->i_sb));
+              EXT4_C2B(EXT4_SB(inode->i_sb),
+                       ext4_count_free_clusters(inode->i_sb)));
        printk(KERN_CRIT "Free/Dirty block details\n");
        printk(KERN_CRIT "free_blocks=%lld\n",
-              (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+              (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+               percpu_counter_sum(&sbi->s_freeclusters_counter)));
        printk(KERN_CRIT "dirty_blocks=%lld\n",
-              (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+              (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+               percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
        printk(KERN_CRIT "Block reservation details\n");
        printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
               EXT4_I(inode)->i_reserved_data_blocks);
@@ -1430,8 +1511,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
                if (err == -EAGAIN)
                        goto submit_io;
 
-               if (err == -ENOSPC &&
-                   ext4_count_free_blocks(sb)) {
+               if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
                        mpd->retval = err;
                        goto submit_io;
                }
@@ -1471,13 +1551,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 
                for (i = 0; i < map.m_len; i++)
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
-       }
 
-       if (ext4_should_order_data(mpd->inode)) {
-               err = ext4_jbd2_file_inode(handle, mpd->inode);
-               if (err)
-                       /* This only happens if the journal is aborted */
-                       return;
+               if (ext4_should_order_data(mpd->inode)) {
+                       err = ext4_jbd2_file_inode(handle, mpd->inode);
+                       if (err) {
+                               /* Only if the journal is aborted */
+                               mpd->retval = err;
+                               goto submit_io;
+                       }
+               }
        }
 
        /*
@@ -1583,6 +1665,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
        return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
 }
 
+/*
+ * This function duplicates code from the very beginning of
+ * ext4_map_blocks(), but assumes that the caller is on the delayed write
+ * path.  It looks up the requested blocks and sets the
+ * buffer delay bit under the protection of i_data_sem.
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+                             struct ext4_map_blocks *map,
+                             struct buffer_head *bh)
+{
+       int retval;
+       sector_t invalid_block = ~((sector_t) 0xffff);
+
+       if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+               invalid_block = ~0;
+
+       map->m_flags = 0;
+       ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+                 "logical block %lu\n", inode->i_ino, map->m_len,
+                 (unsigned long) map->m_lblk);
+       /*
+        * Try to see if we can get the block without requesting a new
+        * file system block.
+        */
+       down_read((&EXT4_I(inode)->i_data_sem));
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+       else
+               retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+
+       if (retval == 0) {
+               /*
+                * XXX: __block_prepare_write() unmaps passed block,
+                * is it OK?
+                */
+               /* If the block was allocated from a previously allocated cluster,
+                * then we don't need to reserve it again. */
+               if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+                       retval = ext4_da_reserve_space(inode, iblock);
+                       if (retval)
+                               /* not enough space to reserve */
+                               goto out_unlock;
+               }
+
+               /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
+                * and it should not appear in bh->b_state.
+                */
+               map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+               map_bh(bh, inode->i_sb, invalid_block);
+               set_buffer_new(bh);
+               set_buffer_delay(bh);
+       }
+
+out_unlock:
+       up_read((&EXT4_I(inode)->i_data_sem));
+
+       return retval;
+}
+
 /*
  * This is a special get_blocks_t callback which is used by
  * ext4_da_write_begin().  It will either return mapped block or
@@ -1600,10 +1742,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 {
        struct ext4_map_blocks map;
        int ret = 0;
-       sector_t invalid_block = ~((sector_t) 0xffff);
-
-       if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
-               invalid_block = ~0;
 
        BUG_ON(create == 0);
        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
@@ -1616,25 +1754,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
         * preallocated blocks are unmapped but should be treated
         * the same as allocated blocks.
         */
-       ret = ext4_map_blocks(NULL, inode, &map, 0);
-       if (ret < 0)
+       ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+       if (ret <= 0)
                return ret;
-       if (ret == 0) {
-               if (buffer_delay(bh))
-                       return 0; /* Not sure this could or should happen */
-               /*
-                * XXX: __block_write_begin() unmaps passed block, is it OK?
-                */
-               ret = ext4_da_reserve_space(inode, iblock);
-               if (ret)
-                       /* not enough space to reserve */
-                       return ret;
-
-               map_bh(bh, inode->i_sb, invalid_block);
-               set_buffer_new(bh);
-               set_buffer_delay(bh);
-               return 0;
-       }
 
        map_bh(bh, inode->i_sb, map.m_pblk);
        bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
@@ -2050,6 +2172,7 @@ static int ext4_da_writepages(struct address_space *mapping,
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        pgoff_t done_index = 0;
        pgoff_t end;
+       struct blk_plug plug;
 
        trace_ext4_da_writepages(inode, wbc);
 
@@ -2128,6 +2251,7 @@ retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
 
+       blk_start_plug(&plug);
        while (!ret && wbc->nr_to_write > 0) {
 
                /*
@@ -2178,11 +2302,12 @@ retry:
                        ret = 0;
                } else if (ret == MPAGE_DA_EXTENT_TAIL) {
                        /*
-                        * got one extent now try with
-                        * rest of the pages
+                        * Got one extent now try with rest of the pages.
+                        * If mpd.retval is set -EIO, journal is aborted.
+                        * So we don't need to write any more.
                         */
                        pages_written += mpd.pages_written;
-                       ret = 0;
+                       ret = mpd.retval;
                        io_done = 1;
                } else if (wbc->nr_to_write)
                        /*
@@ -2192,6 +2317,7 @@ retry:
                         */
                        break;
        }
+       blk_finish_plug(&plug);
        if (!io_done && !cycled) {
                cycled = 1;
                index = 0;
@@ -2230,10 +2356,11 @@ static int ext4_nonda_switch(struct super_block *sb)
         * Delalloc need an accurate free block accounting. So switch
         * to non delalloc when we are near to error range.
         */
-       free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-       dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+       free_blocks  = EXT4_C2B(sbi,
+               percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+       dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
        if (2 * free_blocks < 3 * dirty_blocks ||
-               free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+               free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
                /*
                 * free block count is less than 150% of dirty blocks
                 * or free blocks is less than watermark
@@ -2259,6 +2386,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
        pgoff_t index;
        struct inode *inode = mapping->host;
        handle_t *handle;
+       loff_t page_len;
 
        index = pos >> PAGE_CACHE_SHIFT;
 
@@ -2305,6 +2433,13 @@ retry:
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);
+       } else {
+               page_len = pos & (PAGE_CACHE_SIZE - 1);
+               if (page_len > 0) {
+                       ret = ext4_discard_partial_page_buffers_no_lock(handle,
+                               inode, page, pos - page_len, page_len,
+                               EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
+               }
        }
 
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2347,6 +2482,7 @@ static int ext4_da_write_end(struct file *file,
        loff_t new_i_size;
        unsigned long start, end;
        int write_mode = (int)(unsigned long)fsdata;
+       loff_t page_len;
 
        if (write_mode == FALL_BACK_TO_NONDELALLOC) {
                if (ext4_should_order_data(inode)) {
@@ -2395,6 +2531,16 @@ static int ext4_da_write_end(struct file *file,
        }
        ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
+
+       page_len = PAGE_CACHE_SIZE -
+                       ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));
+
+       if (page_len > 0) {
+               ret = ext4_discard_partial_page_buffers_no_lock(handle,
+                       inode, page, pos + copied - 1, page_len,
+                       EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
+       }
+
        copied = ret2;
        if (ret2 < 0)
                ret = ret2;
@@ -2689,10 +2835,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
         * but being more careful is always safe for the future change.
         */
        inode = io_end->inode;
-       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-               io_end->flag |= EXT4_IO_END_UNWRITTEN;
-               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-       }
+       ext4_set_io_unwritten_flag(inode, io_end);
 
        /* Add the io_end to per-inode completed io list*/
        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -2858,6 +3001,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
 
+       /*
+        * If we are doing data journalling we don't support O_DIRECT
+        */
+       if (ext4_should_journal_data(inode))
+               return 0;
+
        trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -2927,6 +3076,7 @@ static const struct address_space_operations ext4_journalled_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
+       .direct_IO              = ext4_direct_IO,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
 };
@@ -2963,6 +3113,227 @@ void ext4_set_aops(struct inode *inode)
                inode->i_mapping->a_ops = &ext4_journalled_aops;
 }
 
+
+/*
+ * ext4_discard_partial_page_buffers()
+ * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
+ * This function finds and locks the page containing the offset
+ * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
+ * Calling functions that already have the page locked should call
+ * ext4_discard_partial_page_buffers_no_lock directly.
+ */
+int ext4_discard_partial_page_buffers(handle_t *handle,
+               struct address_space *mapping, loff_t from,
+               loff_t length, int flags)
+{
+       struct inode *inode = mapping->host;
+       struct page *page;
+       int err = 0;
+
+       page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+                                  mapping_gfp_mask(mapping) & ~__GFP_FS);
+       if (!page)
+               return -ENOMEM;
+
+       err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
+               from, length, flags);
+
+       unlock_page(page);
+       page_cache_release(page);
+       return err;
+}
+
+/*
+ * ext4_discard_partial_page_buffers_no_lock()
+ * Zeros a page range of length 'length' starting from offset 'from'.
+ * Buffer heads that correspond to the block aligned regions of the
+ * zeroed range will be unmapped.  Regions that are not block aligned
+ * will have the corresponding buffer head mapped if needed so that
+ * that region of the page can be updated with the partial zero out.
+ *
+ * This function assumes that the page has already been locked.  The
+ * range to be discarded must be contained within the given page.
+ * If the specified range exceeds the end of the page it will be shortened
+ * to the end of the page that corresponds to 'from'.  This function is
+ * appropriate for updating a page and its buffer heads to be unmapped and
+ * zeroed for blocks that have either been released or are going to be
+ * released.
+ *
+ * handle: The journal handle
+ * inode:  The file's inode
+ * page:   A locked page that contains the offset "from"
+ * from:   The starting byte offset (from the beginning of the file)
+ *         to begin discarding
+ * length: The number of bytes to discard
+ * flags:  Optional flags that may be used:
+ *
+ *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
+ *         Only zero the regions of the page whose buffer heads
+ *         have already been unmapped.  This flag is appropriate
+ *         for updating the contents of a page whose blocks may
+ *         have already been released, and we only want to zero
+ *         out the regions that correspond to those released blocks.
+ *
+ * Returns zero on success or negative on failure.
+ */
+int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+               struct inode *inode, struct page *page, loff_t from,
+               loff_t length, int flags)
+{
+       ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+       unsigned int offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned int blocksize, max, pos;
+       ext4_lblk_t iblock;
+       struct buffer_head *bh;
+       int err = 0;
+
+       blocksize = inode->i_sb->s_blocksize;
+       max = PAGE_CACHE_SIZE - offset;
+
+       if (index != page->index)
+               return -EINVAL;
+
+       /*
+        * Clamp the length so the range does not extend
+        * past the end of the page
+        */
+       if (length > max || length < 0)
+               length = max;
+
+       iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+       if (!page_has_buffers(page)) {
+               /*
+                * If the range to be discarded covers a partial block
+                * we need to get the page buffers.  This is because
+                * partial blocks cannot be released and the page needs
+                * to be updated with the contents of the block before
+                * we write the zeros on top of it.
+                */
+               if ((from & (blocksize - 1)) ||
+                   ((from + length) & (blocksize - 1))) {
+                       create_empty_buffers(page, blocksize, 0);
+               } else {
+                       /*
+                        * If there are no partial blocks,
+                        * there is nothing to update,
+                        * so we can return now
+                        */
+                       return 0;
+               }
+       }
+
+       /* Find the buffer that contains "offset" */
+       bh = page_buffers(page);
+       pos = blocksize;
+       while (offset >= pos) {
+               bh = bh->b_this_page;
+               iblock++;
+               pos += blocksize;
+       }
+
+       pos = offset;
+       while (pos < offset + length) {
+               unsigned int end_of_block, range_to_discard;
+
+               err = 0;
+
+               /* The length of space left to zero and unmap */
+               range_to_discard = offset + length - pos;
+
+               /* The length of space until the end of the block */
+               end_of_block = blocksize - (pos & (blocksize-1));
+
+               /*
+                * Do not unmap or zero past end of block
+                * for this buffer head
+                */
+               if (range_to_discard > end_of_block)
+                       range_to_discard = end_of_block;
+
+
+               /*
+                * Skip this buffer head if we are only zeroing unmapped
+                * regions of the page
+                */
+               if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
+                       buffer_mapped(bh))
+                               goto next;
+
+               /* If the range is block aligned, unmap */
+               if (range_to_discard == blocksize) {
+                       clear_buffer_dirty(bh);
+                       bh->b_bdev = NULL;
+                       clear_buffer_mapped(bh);
+                       clear_buffer_req(bh);
+                       clear_buffer_new(bh);
+                       clear_buffer_delay(bh);
+                       clear_buffer_unwritten(bh);
+                       clear_buffer_uptodate(bh);
+                       zero_user(page, pos, range_to_discard);
+                       BUFFER_TRACE(bh, "Buffer discarded");
+                       goto next;
+               }
+
+               /*
+                * If this block is not completely contained in the range
+                * to be discarded, then it is not going to be released.  Since
+                * we need to keep this block, we have to make sure this part
+                * of the page is uptodate before we modify it by writing
+                * partial zeros on it.
+                */
+               if (!buffer_mapped(bh)) {
+                       /*
+                        * Buffer head must be mapped before we can read
+                        * from the block
+                        */
+                       BUFFER_TRACE(bh, "unmapped");
+                       ext4_get_block(inode, iblock, bh, 0);
+                       /* unmapped? It's a hole - nothing to do */
+                       if (!buffer_mapped(bh)) {
+                               BUFFER_TRACE(bh, "still unmapped");
+                               goto next;
+                       }
+               }
+
+               /* Ok, it's mapped. Make sure it's up-to-date */
+               if (PageUptodate(page))
+                       set_buffer_uptodate(bh);
+
+               if (!buffer_uptodate(bh)) {
+                       err = -EIO;
+                       ll_rw_block(READ, 1, &bh);
+                       wait_on_buffer(bh);
+                       /* Uhhuh. Read error. Complain and punt.*/
+                       if (!buffer_uptodate(bh))
+                               goto next;
+               }
+
+               if (ext4_should_journal_data(inode)) {
+                       BUFFER_TRACE(bh, "get write access");
+                       err = ext4_journal_get_write_access(handle, bh);
+                       if (err)
+                               goto next;
+               }
+
+               zero_user(page, pos, range_to_discard);
+
+               err = 0;
+               if (ext4_should_journal_data(inode)) {
+                       err = ext4_handle_dirty_metadata(handle, inode, bh);
+               } else
+                       mark_buffer_dirty(bh);
+
+               BUFFER_TRACE(bh, "Partial buffer zeroed");
+next:
+               bh = bh->b_this_page;
+               iblock++;
+               pos += range_to_discard;
+       }
+
+       return err;
+}
+
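
The loop above walks the page's buffer heads and decides, block by block, whether the whole block can simply be unmapped or only a partial range must be zeroed (after making the buffer uptodate).  The per-block arithmetic can be exercised on its own; a minimal userspace sketch, with page and block sizes hard-coded purely for illustration:

#include <stdio.h>

#define PG_SIZE   4096u
#define BLK_SIZE  1024u		/* illustrative 1k blocks, four per page */

/* Walk the byte range [offset, offset + length) within one page and
 * print, for each block, how many bytes would be zeroed and whether the
 * whole block could be unmapped instead of read-modify-zeroed. */
static void walk_partial_page(unsigned int offset, unsigned int length)
{
	unsigned int max = PG_SIZE - offset;
	unsigned int pos, end_of_block, range_to_discard;

	if (length > max)
		length = max;

	pos = offset;
	while (pos < offset + length) {
		range_to_discard = offset + length - pos;
		end_of_block = BLK_SIZE - (pos & (BLK_SIZE - 1));
		if (range_to_discard > end_of_block)
			range_to_discard = end_of_block;

		printf("block %u: zero %u bytes at offset %u (%s)\n",
		       pos / BLK_SIZE, range_to_discard, pos,
		       range_to_discard == BLK_SIZE ? "unmap" : "partial");
		pos += range_to_discard;
	}
}

int main(void)
{
	walk_partial_page(1500, 2200);	/* e.g. discard bytes 1500..3699 */
	return 0;
}
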
 /*
  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
  * up to the end of the block which corresponds to `from'.
@@ -3005,7 +3376,7 @@ int ext4_block_zero_page_range(handle_t *handle,
        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
                                   mapping_gfp_mask(mapping) & ~__GFP_FS);
        if (!page)
-               return -EINVAL;
+               return -ENOMEM;
 
        blocksize = inode->i_sb->s_blocksize;
        max = blocksize - (offset & (blocksize - 1));
@@ -3074,11 +3445,8 @@ int ext4_block_zero_page_range(handle_t *handle,
        err = 0;
        if (ext4_should_journal_data(inode)) {
                err = ext4_handle_dirty_metadata(handle, inode, bh);
-       } else {
-               if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
-                       err = ext4_jbd2_file_inode(handle, inode);
+       } else
                mark_buffer_dirty(bh);
-       }
 
 unlock:
        unlock_page(page);
@@ -3119,6 +3487,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
                return -ENOTSUPP;
        }
 
+       if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+               /* TODO: Add support for bigalloc file systems */
+               return -ENOTSUPP;
+       }
+
        return ext4_ext_punch_hole(file, offset, length);
 }
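
ext4_punch_hole() is reached through the fallocate() path, and the check added above refuses the operation on bigalloc file systems until cluster-aware hole punching exists.  For reference, a minimal userspace caller that exercises this path (the file path is only an example):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_RDWR);		/* example path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Punch a 1 MiB hole starting at offset 4096; on a bigalloc
	 * file system the kernel code above rejects the request. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
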
 
@@ -4420,6 +4793,7 @@ retry_alloc:
                          PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
                        unlock_page(page);
                        ret = VM_FAULT_SIGBUS;
+                       ext4_journal_stop(handle);
                        goto out;
                }
                ext4_set_inode_state(inode, EXT4_STATE_JDATA);
index f18bfe3..a567968 100644 (file)
@@ -21,6 +21,7 @@
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        struct inode *inode = filp->f_dentry->d_inode;
+       struct super_block *sb = inode->i_sb;
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int flags;
 
@@ -173,33 +174,8 @@ setversion_out:
                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
-#ifdef CONFIG_JBD2_DEBUG
-       case EXT4_IOC_WAIT_FOR_READONLY:
-               /*
-                * This is racy - by the time we're woken up and running,
-                * the superblock could be released.  And the module could
-                * have been unloaded.  So sue me.
-                *
-                * Returns 1 if it slept, else zero.
-                */
-               {
-                       struct super_block *sb = inode->i_sb;
-                       DECLARE_WAITQUEUE(wait, current);
-                       int ret = 0;
-
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
-                       if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
-                               schedule();
-                               ret = 1;
-                       }
-                       remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
-                       return ret;
-               }
-#endif
        case EXT4_IOC_GROUP_EXTEND: {
                ext4_fsblk_t n_blocks_count;
-               struct super_block *sb = inode->i_sb;
                int err, err2=0;
 
                err = ext4_resize_begin(sb);
@@ -209,6 +185,13 @@ setversion_out:
                if (get_user(n_blocks_count, (__u32 __user *)arg))
                        return -EFAULT;
 
+               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                              EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+                       ext4_msg(sb, KERN_ERR,
+                                "Online resizing not supported with bigalloc");
+                       return -EOPNOTSUPP;
+               }
+
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
                        return err;
@@ -250,6 +233,13 @@ setversion_out:
                        goto mext_out;
                }
 
+               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                              EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+                       ext4_msg(sb, KERN_ERR,
+                                "Online defrag not supported with bigalloc");
+                       return -EOPNOTSUPP;
+               }
+
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
                        goto mext_out;
@@ -270,7 +260,6 @@ mext_out:
 
        case EXT4_IOC_GROUP_ADD: {
                struct ext4_new_group_data input;
-               struct super_block *sb = inode->i_sb;
                int err, err2=0;
 
                err = ext4_resize_begin(sb);
@@ -281,6 +270,13 @@ mext_out:
                                sizeof(input)))
                        return -EFAULT;
 
+               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                              EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+                       ext4_msg(sb, KERN_ERR,
+                                "Online resizing not supported with bigalloc");
+                       return -EOPNOTSUPP;
+               }
+
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
                        return err;
@@ -337,7 +333,6 @@ mext_out:
 
        case FITRIM:
        {
-               struct super_block *sb = inode->i_sb;
                struct request_queue *q = bdev_get_queue(sb->s_bdev);
                struct fstrim_range range;
                int ret = 0;
@@ -348,7 +343,14 @@ mext_out:
                if (!blk_queue_discard(q))
                        return -EOPNOTSUPP;
 
-               if (copy_from_user(&range, (struct fstrim_range *)arg,
+               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                              EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+                       ext4_msg(sb, KERN_ERR,
+                                "FITRIM not supported with bigalloc");
+                       return -EOPNOTSUPP;
+               }
+
+               if (copy_from_user(&range, (struct fstrim_range __user *)arg,
                    sizeof(range)))
                        return -EFAULT;
 
@@ -358,7 +360,7 @@ mext_out:
                if (ret < 0)
                        return ret;
 
-               if (copy_to_user((struct fstrim_range *)arg, &range,
+               if (copy_to_user((struct fstrim_range __user *)arg, &range,
                    sizeof(range)))
                        return -EFAULT;
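
The FITRIM hunk above adds the bigalloc rejection and the __user annotations on the range pointer.  For context, this is roughly how the ioctl is driven from userspace; fstrim does essentially the same (the mount point below is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt", O_RDONLY);	/* any file or dir on the fs */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = (unsigned long long)-1;	/* trim the whole file system */
	range.minlen = 0;

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");	/* e.g. EOPNOTSUPP on bigalloc */
	else
		printf("trimmed %llu bytes\n",
		       (unsigned long long)range.len);
	close(fd);
	return 0;
}
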
 
@@ -396,11 +398,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case EXT4_IOC32_SETVERSION_OLD:
                cmd = EXT4_IOC_SETVERSION_OLD;
                break;
-#ifdef CONFIG_JBD2_DEBUG
-       case EXT4_IOC32_WAIT_FOR_READONLY:
-               cmd = EXT4_IOC_WAIT_FOR_READONLY;
-               break;
-#endif
        case EXT4_IOC32_GETRSVSZ:
                cmd = EXT4_IOC_GETRSVSZ;
                break;
index 17a5a57..e2d8be8 100644 (file)
@@ -70,8 +70,8 @@
  *
  * pa_lstart -> the logical start block for this prealloc space
  * pa_pstart -> the physical start block for this prealloc space
- * pa_len    -> length for this prealloc space
- * pa_free   ->  free space available in this prealloc space
+ * pa_len    -> length for this prealloc space (in clusters)
+ * pa_free   ->  free space available in this prealloc space (in clusters)
  *
  * The inode preallocation space is used looking at the _logical_ start
  * block. If only the logical file block falls within the range of prealloc
  * list. In case of inode preallocation we follow a list of heuristics
  * based on file size. This can be found in ext4_mb_normalize_request. If
  * we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
+ * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
  * 512 blocks. This can be tuned via
  * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
  * terms of number of blocks. If we have mounted the file system with -O
@@ -459,7 +460,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                        ext4_fsblk_t blocknr;
 
                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
-                       blocknr += first + i;
+                       blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                              inode ? inode->i_ino : 0,
                                              blocknr,
@@ -580,7 +581,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
                                continue;
                        }
 
-                       /* both bits in buddy2 must be 0 */
+                       /* both bits in buddy2 must be 1 */
                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 
@@ -653,7 +654,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        ext4_grpblk_t chunk;
        unsigned short border;
 
-       BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
+       BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 
        border = 2 << sb->s_blocksize_bits;
 
@@ -705,7 +706,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
                                void *buddy, void *bitmap, ext4_group_t group)
 {
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-       ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+       ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_grpblk_t i = 0;
        ext4_grpblk_t first;
        ext4_grpblk_t len;
@@ -734,7 +735,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 
        if (free != grp->bb_free) {
                ext4_grp_locked_error(sb, group, 0, 0,
-                                     "%u blocks in bitmap, %u in gd",
+                                     "%u clusters in bitmap, %u in gd",
                                      free, grp->bb_free);
                /*
                 * If we intend to continue, we consider the group descriptor
@@ -1339,7 +1340,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        ext4_fsblk_t blocknr;
 
                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
-                       blocknr += block;
+                       blocknr += EXT4_C2B(EXT4_SB(sb), block);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                              inode ? inode->i_ino : 0,
                                              blocknr,
@@ -1390,7 +1391,6 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
 {
        int next = block;
        int max;
-       int ord;
        void *buddy;
 
        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1432,9 +1432,8 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
                if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
                        break;
 
-               ord = mb_find_order_for_block(e4b, next);
+               order = mb_find_order_for_block(e4b, next);
 
-               order = ord;
                block = next >> order;
                ex->fe_len += 1 << order;
        }
@@ -1624,8 +1623,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
        struct ext4_free_extent *gex = &ac->ac_g_ex;
 
        BUG_ON(ex->fe_len <= 0);
-       BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
-       BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+       BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+       BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
 
        ac->ac_found++;
@@ -1823,15 +1822,15 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 
        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
                i = mb_find_next_zero_bit(bitmap,
-                                               EXT4_BLOCKS_PER_GROUP(sb), i);
-               if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+                                               EXT4_CLUSTERS_PER_GROUP(sb), i);
+               if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
                        /*
                         * If we have a corrupt bitmap, we won't find any
                         * free blocks even though group info says we
                         * have free blocks
                         */
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
-                                       "%d free blocks as per "
+                                       "%d free clusters as per "
                                        "group info. But bitmap says 0",
                                        free);
                        break;
@@ -1841,7 +1840,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                BUG_ON(ex.fe_len <= 0);
                if (free < ex.fe_len) {
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
-                                       "%d free blocks as per "
+                                       "%d free clusters as per "
                                        "group info. But got %d blocks",
                                        free, ex.fe_len);
                        /*
@@ -1887,7 +1886,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
        do_div(a, sbi->s_stripe);
        i = (a * sbi->s_stripe) - first_group_block;
 
-       while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+       while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
                if (!mb_test_bit(i, bitmap)) {
                        max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
                        if (max >= sbi->s_stripe) {
@@ -2252,10 +2251,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
         */
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                meta_group_info[i]->bb_free =
-                       ext4_free_blocks_after_init(sb, group, desc);
+                       ext4_free_clusters_after_init(sb, group, desc);
        } else {
                meta_group_info[i]->bb_free =
-                       ext4_free_blks_count(sb, desc);
+                       ext4_free_group_clusters(sb, desc);
        }
 
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2473,7 +2472,20 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
-       sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+       /*
+        * The default group preallocation is 512, which for 4k block
+        * sizes translates to 2 megabytes.  However for bigalloc file
+        * systems, this is probably too big (i.e., if the cluster size
+        * is 1 megabyte, then group preallocation size becomes half a
+        * gigabyte!).  As a default, we will keep a two megabyte
+        * group prealloc size for cluster sizes up to 64k, and after
+        * that, we will force a minimum group preallocation size of
+        * 32 clusters.  This translates to 8 megs when the cluster
+        * size is 256k, and 32 megs when the cluster size is 1 meg,
+        * which seems reasonable as a default.
+        */
+       sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+                                      sbi->s_cluster_bits, 32);
        /*
         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
         * to the lowest multiple of s_stripe which is bigger than
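
To make the sizing comment above concrete, max(MB_DEFAULT_GROUP_PREALLOC >> s_cluster_bits, 32) works out as follows on a 4k-block file system; this sketch just reproduces that arithmetic, taking MB_DEFAULT_GROUP_PREALLOC as 512 blocks per the comment:

#include <stdio.h>

#define MB_DEFAULT_GROUP_PREALLOC 512	/* blocks, per the comment above */

int main(void)
{
	unsigned int block_size = 4096;

	/* cluster sizes from 4k (cluster_bits = 0) up to 1M (cluster_bits = 8) */
	for (unsigned int cluster_bits = 0; cluster_bits <= 8; cluster_bits++) {
		unsigned int prealloc = MB_DEFAULT_GROUP_PREALLOC >> cluster_bits;
		unsigned long long cluster_size =
			(unsigned long long)block_size << cluster_bits;

		if (prealloc < 32)
			prealloc = 32;
		printf("cluster %5lluk -> prealloc %3u clusters = %llu MiB\n",
		       cluster_size >> 10, prealloc,
		       prealloc * cluster_size >> 20);
	}
	return 0;
}

The 64k-cluster case is the crossover point: 512 >> 4 is exactly 32 clusters, still 2 MiB, and larger clusters hit the 32-cluster floor (8 MiB at 256k, 32 MiB at 1M), matching the comment.
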
@@ -2490,7 +2502,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (sbi->s_locality_groups == NULL) {
                ret = -ENOMEM;
-               goto out;
+               goto out_free_groupinfo_slab;
        }
        for_each_possible_cpu(i) {
                struct ext4_locality_group *lg;
@@ -2503,9 +2515,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
-       if (ret != 0) {
-               goto out;
-       }
+       if (ret != 0)
+               goto out_free_locality_groups;
 
        if (sbi->s_proc)
                proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
@@ -2513,11 +2524,19 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
+       return 0;
+
+out_free_locality_groups:
+       free_percpu(sbi->s_locality_groups);
+       sbi->s_locality_groups = NULL;
+out_free_groupinfo_slab:
+       ext4_groupinfo_destroy_slabs();
 out:
-       if (ret) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-       }
+       kfree(sbi->s_mb_offsets);
+       sbi->s_mb_offsets = NULL;
+       kfree(sbi->s_mb_maxs);
+       sbi->s_mb_maxs = NULL;
        return ret;
 }
 
@@ -2602,11 +2621,13 @@ int ext4_mb_release(struct super_block *sb)
 }
 
 static inline int ext4_issue_discard(struct super_block *sb,
-               ext4_group_t block_group, ext4_grpblk_t block, int count)
+               ext4_group_t block_group, ext4_grpblk_t cluster, int count)
 {
        ext4_fsblk_t discard_block;
 
-       discard_block = block + ext4_group_first_block_no(sb, block_group);
+       discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+                        ext4_group_first_block_no(sb, block_group));
+       count = EXT4_C2B(EXT4_SB(sb), count);
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
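
Many hunks in this series convert between clusters and blocks via EXT4_C2B()/EXT4_B2C().  Assuming those macros are plain shifts by s_cluster_bits (the cluster ratio being a power of two), the translation done in ext4_issue_discard() above amounts to:

#include <stdio.h>

/* Assumed semantics of the bigalloc conversion helpers: one cluster is
 * 2^cluster_bits file system blocks. */
static unsigned long long c2b(unsigned long long cluster, int cluster_bits)
{
	return cluster << cluster_bits;
}

static unsigned long long b2c(unsigned long long block, int cluster_bits)
{
	return block >> cluster_bits;
}

int main(void)
{
	int cluster_bits = 4;			/* 64k clusters, 4k blocks */
	unsigned long long group_first_block = 32768;
	unsigned long long cluster = 100, cluster_count = 3;

	/* Mirror of ext4_issue_discard(): turn the cluster-relative range
	 * into an absolute block range before issuing the discard. */
	unsigned long long discard_block =
		c2b(cluster, cluster_bits) + group_first_block;
	unsigned long long block_count = c2b(cluster_count, cluster_bits);

	printf("discard blocks %llu..%llu\n",
	       discard_block, discard_block + block_count - 1);
	printf("block %llu maps back to cluster %llu of its group\n",
	       discard_block,
	       b2c(discard_block - group_first_block, cluster_bits));
	return 0;
}
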
@@ -2633,7 +2654,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 
                if (test_opt(sb, DISCARD))
                        ext4_issue_discard(sb, entry->group,
-                                          entry->start_blk, entry->count);
+                                          entry->start_cluster, entry->count);
 
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -2646,7 +2667,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                ext4_lock_group(sb, entry->group);
                /* Take it out of per group rb tree */
                rb_erase(&entry->node, &(db->bb_free_root));
-               mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+               mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
 
                /*
                 * Clear the trimmed flag for the group so that the next
@@ -2752,7 +2773,7 @@ void ext4_exit_mballoc(void)
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-                               handle_t *handle, unsigned int reserv_blks)
+                               handle_t *handle, unsigned int reserv_clstrs)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_group_desc *gdp;
@@ -2783,7 +2804,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                goto out_err;
 
        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
-                       ext4_free_blks_count(sb, gdp));
+                       ext4_free_group_clusters(sb, gdp));
 
        err = ext4_journal_get_write_access(handle, gdp_bh);
        if (err)
@@ -2791,7 +2812,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 
-       len = ac->ac_b_ex.fe_len;
+       len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
        if (!ext4_data_block_valid(sbi, block, len)) {
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata\n", block, block+len);
@@ -2823,28 +2844,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                      ac->ac_b_ex.fe_len);
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-               ext4_free_blks_set(sb, gdp,
-                                       ext4_free_blocks_after_init(sb,
-                                       ac->ac_b_ex.fe_group, gdp));
+               ext4_free_group_clusters_set(sb, gdp,
+                                            ext4_free_clusters_after_init(sb,
+                                               ac->ac_b_ex.fe_group, gdp));
        }
-       len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
-       ext4_free_blks_set(sb, gdp, len);
+       len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+       ext4_free_group_clusters_set(sb, gdp, len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 
        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
-       percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+       percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
        /*
         * Now reduce the dirty block count also. Should not go negative
         */
        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
                /* release all the reserved blocks if non delalloc */
-               percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+               percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+                                  reserv_clstrs);
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
                atomic_sub(ac->ac_b_ex.fe_len,
-                          &sbi->s_flex_groups[flex_group].free_blocks);
+                          &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2886,6 +2908,7 @@ static noinline_for_stack void
 ext4_mb_normalize_request(struct ext4_allocation_context *ac,
                                struct ext4_allocation_request *ar)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int bsbits, max;
        ext4_lblk_t end;
        loff_t size, orig_size, start_off;
@@ -2916,7 +2939,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 
        /* first, let's learn actual file size
         * given current request is allocated */
-       size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+       size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
        size = size << bsbits;
        if (size < i_size_read(ac->ac_inode))
                size = i_size_read(ac->ac_inode);
@@ -2988,7 +3011,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
                        continue;
                }
 
-               pa_end = pa->pa_lstart + pa->pa_len;
+               pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+                                                 pa->pa_len);
 
                /* PA must not overlap original request */
                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3042,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
                ext4_lblk_t pa_end;
+
                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted == 0) {
-                       pa_end = pa->pa_lstart + pa->pa_len;
+                       pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+                                                         pa->pa_len);
                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
                }
                spin_unlock(&pa->pa_lock);
@@ -3036,14 +3062,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        }
        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
                        start > ac->ac_o_ex.fe_logical);
-       BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+       BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
 
        /* now prepare goal request */
 
        /* XXX: is it better to align blocks WRT to logical
         * placement or satisfy big request as is */
        ac->ac_g_ex.fe_logical = start;
-       ac->ac_g_ex.fe_len = size;
+       ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
 
        /* define goal start in order to merge */
        if (ar->pright && (ar->lright == (start + size))) {
@@ -3112,14 +3138,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
                                struct ext4_prealloc_space *pa)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        ext4_fsblk_t start;
        ext4_fsblk_t end;
        int len;
 
        /* found preallocated blocks, use them */
        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
-       end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
-       len = end - start;
+       end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+                 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+       len = EXT4_NUM_B2C(sbi, end - start);
        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
                                        &ac->ac_b_ex.fe_start);
        ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3155,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
        ac->ac_pa = pa;
 
        BUG_ON(start < pa->pa_pstart);
-       BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+       BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
        BUG_ON(pa->pa_free < len);
        pa->pa_free -= len;
 
@@ -3193,6 +3221,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
 static noinline_for_stack int
 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int order, i;
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_locality_group *lg;
@@ -3210,12 +3239,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
                /* all fields in this condition don't change,
                 * so we can skip locking for them */
                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
-                       ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+                   ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+                                              EXT4_C2B(sbi, pa->pa_len)))
                        continue;
 
                /* non-extent files can't have physical blocks past 2^32 */
                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
-                       pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+                   (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+                    EXT4_MAX_BLOCK_FILE_PHYS))
                        continue;
 
                /* found preallocated blocks, use them */
@@ -3291,7 +3322,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 
        while (n) {
                entry = rb_entry(n, struct ext4_free_data, node);
-               ext4_set_bits(bitmap, entry->start_blk, entry->count);
+               ext4_set_bits(bitmap, entry->start_cluster, entry->count);
                n = rb_next(n);
        }
        return;
@@ -3312,7 +3343,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
        ext4_group_t groupnr;
        ext4_grpblk_t start;
        int preallocated = 0;
-       int count = 0;
        int len;
 
        /* all form of preallocation discards first load group,
@@ -3335,7 +3365,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                BUG_ON(groupnr != group);
                ext4_set_bits(bitmap, start, len);
                preallocated += len;
-               count++;
        }
        mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
 }
@@ -3412,6 +3441,7 @@ static noinline_for_stack int
 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_prealloc_space *pa;
        struct ext4_group_info *grp;
        struct ext4_inode_info *ei;
@@ -3443,16 +3473,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
                winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
 
                /* also, we should cover whole original request */
-               wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+               wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
 
                /* the smallest one defines real window */
                win = min(winl, wins);
 
-               offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+               offs = ac->ac_o_ex.fe_logical %
+                       EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
                if (offs && offs < win)
                        win = offs;
 
-               ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+               ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+                       EXT4_B2C(sbi, win);
                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
        }
@@ -3477,7 +3509,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        trace_ext4_mb_new_inode_pa(ac, pa);
 
        ext4_mb_use_inode_pa(ac, pa);
-       atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+       atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
 
        ei = EXT4_I(ac->ac_inode);
        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3624,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
-       grp_blk_start = pa->pa_pstart - bit;
+       grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
 
@@ -3607,7 +3639,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                free += next - bit;
 
                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
-               trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
+               trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+                                                   EXT4_C2B(sbi, bit)),
                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
@@ -3690,7 +3723,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        }
 
        if (needed == 0)
-               needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+               needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
 
        INIT_LIST_HEAD(&list);
 repeat:
@@ -3958,7 +3991,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                return;
 
-       size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+       size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
                >> bsbits;
 
@@ -3969,6 +4002,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
                return;
        }
 
+       if (sbi->s_mb_group_prealloc <= 0) {
+               ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+               return;
+       }
+
        /* don't use group allocation for large files */
        size = max(size, isize);
        if (size > sbi->s_mb_stream_request) {
@@ -4007,8 +4045,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        len = ar->len;
 
        /* just a dirty hack to filter too big requests  */
-       if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
-               len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+       if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
+               len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
 
        /* start searching from the goal */
        goal = ar->goal;
@@ -4019,18 +4057,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
        /* set up allocation goals */
        memset(ac, 0, sizeof(struct ext4_allocation_context));
-       ac->ac_b_ex.fe_logical = ar->logical;
+       ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
        ac->ac_status = AC_STATUS_CONTINUE;
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
-       ac->ac_o_ex.fe_logical = ar->logical;
+       ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
        ac->ac_o_ex.fe_group = group;
        ac->ac_o_ex.fe_start = block;
        ac->ac_o_ex.fe_len = len;
-       ac->ac_g_ex.fe_logical = ar->logical;
-       ac->ac_g_ex.fe_group = group;
-       ac->ac_g_ex.fe_start = block;
-       ac->ac_g_ex.fe_len = len;
+       ac->ac_g_ex = ac->ac_o_ex;
        ac->ac_flags = ar->flags;
 
        /* we have to define context: we'll we work with a file or
@@ -4182,13 +4217,14 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
                if (pa->pa_type == MB_GROUP_PA) {
                        /* see comment in ext4_mb_use_group_pa() */
                        spin_lock(&pa->pa_lock);
-                       pa->pa_pstart += ac->ac_b_ex.fe_len;
-                       pa->pa_lstart += ac->ac_b_ex.fe_len;
+                       pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+                       pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
                        pa->pa_free -= ac->ac_b_ex.fe_len;
                        pa->pa_len -= ac->ac_b_ex.fe_len;
                        spin_unlock(&pa->pa_lock);
@@ -4249,13 +4285,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        struct super_block *sb;
        ext4_fsblk_t block = 0;
        unsigned int inquota = 0;
-       unsigned int reserv_blks = 0;
+       unsigned int reserv_clstrs = 0;
 
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);
 
        trace_ext4_request_blocks(ar);
 
+       /* Allow use of the superuser reservation for the quota file */
+       if (IS_NOQUOTA(ar->inode))
+               ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
        /*
         * For delayed allocation, we could skip the ENOSPC and
         * EDQUOT check, as blocks and quotas have been already
@@ -4269,7 +4309,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                 * and verify allocation doesn't exceed the quota limits.
                 */
                while (ar->len &&
-                       ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+                       ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
 
                        /* let others to free the space */
                        yield();
@@ -4279,12 +4319,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                        *errp = -ENOSPC;
                        return 0;
                }
-               reserv_blks = ar->len;
+               reserv_clstrs = ar->len;
                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
-                       dquot_alloc_block_nofail(ar->inode, ar->len);
+                       dquot_alloc_block_nofail(ar->inode,
+                                                EXT4_C2B(sbi, ar->len));
                } else {
                        while (ar->len &&
-                               dquot_alloc_block(ar->inode, ar->len)) {
+                               dquot_alloc_block(ar->inode,
+                                                 EXT4_C2B(sbi, ar->len))) {
 
                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
                                ar->len--;
@@ -4328,7 +4370,7 @@ repeat:
                        ext4_mb_new_preallocation(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
-               *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+               *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
                if (*errp == -EAGAIN) {
                        /*
                         * drop the reference that we took
@@ -4364,13 +4406,13 @@ out:
        if (ac)
                kmem_cache_free(ext4_ac_cachep, ac);
        if (inquota && ar->len < inquota)
-               dquot_free_block(ar->inode, inquota - ar->len);
+               dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
        if (!ar->len) {
                if (!ext4_test_inode_state(ar->inode,
                                           EXT4_STATE_DELALLOC_RESERVED))
                        /* release all the reserved blocks if non delalloc */
-                       percpu_counter_sub(&sbi->s_dirtyblocks_counter,
-                                               reserv_blks);
+                       percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+                                               reserv_clstrs);
        }
 
        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4388,7 +4430,7 @@ static int can_merge(struct ext4_free_data *entry1,
 {
        if ((entry1->t_tid == entry2->t_tid) &&
            (entry1->group == entry2->group) &&
-           ((entry1->start_blk + entry1->count) == entry2->start_blk))
+           ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
                return 1;
        return 0;
 }
@@ -4398,7 +4440,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                      struct ext4_free_data *new_entry)
 {
        ext4_group_t group = e4b->bd_group;
-       ext4_grpblk_t block;
+       ext4_grpblk_t cluster;
        struct ext4_free_data *entry;
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
@@ -4411,7 +4453,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        BUG_ON(e4b->bd_buddy_page == NULL);
 
        new_node = &new_entry->node;
-       block = new_entry->start_blk;
+       cluster = new_entry->start_cluster;
 
        if (!*n) {
                /* first free block extent. We need to
@@ -4425,13 +4467,14 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        while (*n) {
                parent = *n;
                entry = rb_entry(parent, struct ext4_free_data, node);
-               if (block < entry->start_blk)
+               if (cluster < entry->start_cluster)
                        n = &(*n)->rb_left;
-               else if (block >= (entry->start_blk + entry->count))
+               else if (cluster >= (entry->start_cluster + entry->count))
                        n = &(*n)->rb_right;
                else {
                        ext4_grp_locked_error(sb, group, 0,
-                               ext4_group_first_block_no(sb, group) + block,
+                               ext4_group_first_block_no(sb, group) +
+                               EXT4_C2B(sbi, cluster),
                                "Block already on to-be-freed list");
                        return 0;
                }
@@ -4445,7 +4488,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, node);
                if (can_merge(entry, new_entry)) {
-                       new_entry->start_blk = entry->start_blk;
+                       new_entry->start_cluster = entry->start_cluster;
                        new_entry->count += entry->count;
                        rb_erase(node, &(db->bb_free_root));
                        spin_lock(&sbi->s_md_lock);
@@ -4496,6 +4539,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        ext4_group_t block_group;
        struct ext4_sb_info *sbi;
        struct ext4_buddy e4b;
+       unsigned int count_clusters;
        int err = 0;
        int ret;
 
@@ -4544,6 +4588,38 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        if (!ext4_should_writeback_data(inode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
 
+       /*
+        * If the extent to be freed does not begin on a cluster
+        * boundary, we need to deal with partial clusters at the
+        * beginning and end of the extent.  Normally we will free
+        * blocks at the beginning or the end unless we are explicitly
+        * requested to avoid doing so.
+        */
+       overflow = block & (sbi->s_cluster_ratio - 1);
+       if (overflow) {
+               if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+                       overflow = sbi->s_cluster_ratio - overflow;
+                       block += overflow;
+                       if (count > overflow)
+                               count -= overflow;
+                       else
+                               return;
+               } else {
+                       block -= overflow;
+                       count += overflow;
+               }
+       }
+       overflow = count & (sbi->s_cluster_ratio - 1);
+       if (overflow) {
+               if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+                       if (count > overflow)
+                               count -= overflow;
+                       else
+                               return;
+               } else
+                       count += sbi->s_cluster_ratio - overflow;
+       }
+
 do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
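
The partial-cluster handling added above rounds the freed block range out to cluster boundaries, or trims the partial clusters off when the NOFREE_FIRST/LAST_CLUSTER flags ask for that.  A small sketch of the rounding arithmetic, with the flags reduced to booleans and the early-return case collapsed to a zero count, purely for illustration:

#include <stdio.h>

/* Round the block range [block, block + count) out to cluster boundaries,
 * or trim partial clusters off when asked to keep them.  cluster_ratio is
 * the number of blocks per cluster and must be a power of two. */
static void adjust_freed_range(unsigned long long *block,
			       unsigned long long *count,
			       unsigned int cluster_ratio,
			       int nofree_first, int nofree_last)
{
	unsigned long long overflow;

	overflow = *block & (cluster_ratio - 1);
	if (overflow) {
		if (nofree_first) {
			overflow = cluster_ratio - overflow;
			*block += overflow;
			*count = (*count > overflow) ? *count - overflow : 0;
		} else {
			*block -= overflow;
			*count += overflow;
		}
	}
	overflow = *count & (cluster_ratio - 1);
	if (overflow) {
		if (nofree_last)
			*count = (*count > overflow) ? *count - overflow : 0;
		else
			*count += cluster_ratio - overflow;
	}
}

int main(void)
{
	unsigned long long block = 1003, count = 10;

	/* 16 blocks per cluster; round the range out in both directions */
	adjust_freed_range(&block, &count, 16, 0, 0);
	printf("freeing blocks %llu..%llu\n", block, block + count - 1);
	return 0;
}
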
@@ -4552,10 +4628,12 @@ do_more:
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
-       if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-               overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+       if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+               overflow = EXT4_C2B(sbi, bit) + count -
+                       EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
        }
+       count_clusters = EXT4_B2C(sbi, count);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh) {
                err = -EIO;
@@ -4570,9 +4648,9 @@ do_more:
        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
            in_range(block, ext4_inode_table(sb, gdp),
-                     EXT4_SB(sb)->s_itb_per_group) ||
+                    EXT4_SB(sb)->s_itb_per_group) ||
            in_range(block + count - 1, ext4_inode_table(sb, gdp),
-                     EXT4_SB(sb)->s_itb_per_group)) {
+                    EXT4_SB(sb)->s_itb_per_group)) {
 
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
@@ -4597,11 +4675,11 @@ do_more:
 #ifdef AGGRESSIVE_CHECK
        {
                int i;
-               for (i = 0; i < count; i++)
+               for (i = 0; i < count_clusters; i++)
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
 #endif
-       trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
+       trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
 
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
@@ -4618,13 +4696,13 @@ do_more:
                        err = -ENOMEM;
                        goto error_return;
                }
-               new_entry->start_blk = bit;
+               new_entry->start_cluster = bit;
                new_entry->group  = block_group;
-               new_entry->count = count;
+               new_entry->count = count_clusters;
                new_entry->t_tid = handle->h_transaction->t_tid;
 
                ext4_lock_group(sb, block_group);
-               mb_clear_bits(bitmap_bh->b_data, bit, count);
+               mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                ext4_mb_free_metadata(handle, &e4b, new_entry);
        } else {
                /* need to update group_info->bb_free and bitmap
@@ -4632,25 +4710,29 @@ do_more:
                 * them with group lock_held
                 */
                ext4_lock_group(sb, block_group);
-               mb_clear_bits(bitmap_bh->b_data, bit, count);
-               mb_free_blocks(inode, &e4b, bit, count);
+               mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+               mb_free_blocks(inode, &e4b, bit, count_clusters);
        }
 
-       ret = ext4_free_blks_count(sb, gdp) + count;
-       ext4_free_blks_set(sb, gdp, ret);
+       ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+       ext4_free_group_clusters_set(sb, gdp, ret);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
        ext4_unlock_group(sb, block_group);
-       percpu_counter_add(&sbi->s_freeblocks_counter, count);
+       percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+               atomic_add(count_clusters,
+                          &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);
 
        freed += count;
 
+       if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+               dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4669,8 +4751,6 @@ do_more:
        }
        ext4_mark_super_dirty(sb);
 error_return:
-       if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
-               dquot_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
        return;
@@ -4778,16 +4858,17 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
        ext4_lock_group(sb, block_group);
        mb_clear_bits(bitmap_bh->b_data, bit, count);
        mb_free_blocks(NULL, &e4b, bit, count);
-       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-       ext4_free_blks_set(sb, desc, blk_free_count);
+       blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+       ext4_free_group_clusters_set(sb, desc, blk_free_count);
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
        ext4_unlock_group(sb, block_group);
-       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+       percpu_counter_add(&sbi->s_freeclusters_counter,
+                          EXT4_B2C(sbi, blocks_freed));
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(blocks_freed,
-                          &sbi->s_flex_groups[flex_group].free_blocks);
+               atomic_add(EXT4_B2C(sbi, blocks_freed),
+                          &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);
@@ -4948,7 +5029,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        struct ext4_group_info *grp;
        ext4_group_t first_group, last_group;
        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
-       ext4_grpblk_t cnt = 0, first_block, last_block;
+       ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
        uint64_t start, len, minlen, trimmed = 0;
        ext4_fsblk_t first_data_blk =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
@@ -4958,7 +5039,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        len = range->len >> sb->s_blocksize_bits;
        minlen = range->minlen >> sb->s_blocksize_bits;
 
-       if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+       if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
                return -EINVAL;
        if (start + len <= first_data_blk)
                goto out;
@@ -4969,11 +5050,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
        /* Determine first and last group to examine based on start and len */
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
-                                    &first_group, &first_block);
+                                    &first_group, &first_cluster);
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
-                                    &last_group, &last_block);
+                                    &last_group, &last_cluster);
        last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
-       last_block = EXT4_BLOCKS_PER_GROUP(sb);
+       last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
 
        if (first_group > last_group)
                return -EINVAL;
@@ -4993,20 +5074,20 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                 * change it for the last group in which case start +
                 * len < EXT4_BLOCKS_PER_GROUP(sb).
                 */
-               if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
-                       last_block = first_block + len;
-               len -= last_block - first_block;
+               if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
+                       last_cluster = first_cluster + len;
+               len -= last_cluster - first_cluster;
 
                if (grp->bb_free >= minlen) {
-                       cnt = ext4_trim_all_free(sb, group, first_block,
-                                               last_block, minlen);
+                       cnt = ext4_trim_all_free(sb, group, first_cluster,
+                                               last_cluster, minlen);
                        if (cnt < 0) {
                                ret = cnt;
                                break;
                        }
                }
                trimmed += cnt;
-               first_block = 0;
+               first_cluster = 0;
        }
        range->len = trimmed * sb->s_blocksize;
 
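
With bigalloc the block bitmap tracks clusters, so ext4_free_blocks() above first widens or trims the freed range to whole clusters before touching the bitmap. A small user-space sketch of that rounding arithmetic, assuming cluster_ratio is a power of two; cluster_ratio and the nofree_first/nofree_last parameters stand in for sbi->s_cluster_ratio and the EXT4_FREE_BLOCKS_NOFREE_FIRST/LAST_CLUSTER flags:

#include <stdio.h>

/*
 * Stand-alone model of the cluster rounding at the top of
 * ext4_free_blocks() above; not the kernel code itself.
 */
static void round_to_clusters(unsigned long long *block, unsigned long *count,
			      unsigned int cluster_ratio,
			      int nofree_first, int nofree_last)
{
	unsigned long overflow;

	/* Partial first cluster: skip it, or extend the range down over it. */
	overflow = *block & (cluster_ratio - 1);
	if (overflow) {
		if (nofree_first) {
			overflow = cluster_ratio - overflow;
			*block += overflow;
			/* the kernel simply returns when nothing is left */
			*count = (*count > overflow) ? *count - overflow : 0;
		} else {
			*block -= overflow;
			*count += overflow;
		}
	}

	/* Partial last cluster: drop the tail, or extend the range past it. */
	overflow = *count & (cluster_ratio - 1);
	if (overflow) {
		if (nofree_last)
			*count = (*count > overflow) ? *count - overflow : 0;
		else
			*count += cluster_ratio - overflow;
	}
}

int main(void)
{
	unsigned long long block = 1001;	/* not cluster aligned */
	unsigned long count = 30;

	round_to_clusters(&block, &count, 16, 0, 0);
	/* prints "freeing blocks 992..1039": three whole 16-block clusters */
	printf("freeing blocks %llu..%llu\n", block, block + count - 1);
	return 0;
}
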
index 9d4a636..47705f3 100644 (file)
@@ -106,7 +106,7 @@ struct ext4_free_data {
        ext4_group_t group;
 
        /* free block extent */
-       ext4_grpblk_t start_blk;
+       ext4_grpblk_t start_cluster;
        ext4_grpblk_t count;
 
        /* transaction which freed this extent */
@@ -139,9 +139,9 @@ enum {
 
 struct ext4_free_extent {
        ext4_lblk_t fe_logical;
-       ext4_grpblk_t fe_start;
+       ext4_grpblk_t fe_start; /* In cluster units */
        ext4_group_t fe_group;
-       ext4_grpblk_t fe_len;
+       ext4_grpblk_t fe_len;   /* In cluster units */
 };
 
 /*
@@ -175,7 +175,7 @@ struct ext4_allocation_context {
        /* the best found extent */
        struct ext4_free_extent ac_b_ex;
 
-       /* copy of the bext found extent taken before preallocation efforts */
+       /* copy of the best found extent taken before preallocation efforts */
        struct ext4_free_extent ac_f_ex;
 
        /* number of iterations done. we have to track to limit searching */
@@ -216,6 +216,7 @@ struct ext4_buddy {
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
 {
-       return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
+       return ext4_group_first_block_no(sb, fex->fe_group) +
+               (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
 }
 #endif
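
Because fe_start and fe_len are now counted in clusters, mapping a found extent back to a physical block needs the extra shift added to ext4_grp_offs_to_block() above. A hedged, stand-alone restatement of that conversion; the parameter names are illustrative, and in the kernel the group's first block comes from ext4_group_first_block_no() and the shift count from EXT4_SB(sb)->s_cluster_bits:

#include <stdio.h>

/* Physical block number of a free extent whose start offset within its
 * block group is counted in clusters rather than blocks. */
static unsigned long long grp_offs_to_block(unsigned long long group_first_block,
					    unsigned int start_cluster,
					    unsigned int cluster_bits)
{
	return group_first_block +
	       ((unsigned long long)start_cluster << cluster_bits);
}

int main(void)
{
	/* 4k blocks, 64k clusters -> cluster_bits = 4; cluster 5 of a group
	 * starting at block 32768 begins at block 32768 + 5*16 = 32848 */
	printf("%llu\n", grp_offs_to_block(32768, 5, 4));
	return 0;
}
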
index b57b98f..f729377 100644 (file)
 #include <linux/module.h>
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 
 /*
  * The contiguous blocks details which can be
  * represented by a single extent
  */
-struct list_blocks_struct {
-       ext4_lblk_t first_block, last_block;
+struct migrate_struct {
+       ext4_lblk_t first_block, last_block, curr_block;
        ext4_fsblk_t first_pblock, last_pblock;
 };
 
 static int finish_range(handle_t *handle, struct inode *inode,
-                               struct list_blocks_struct *lb)
+                               struct migrate_struct *lb)
 
 {
        int retval = 0, needed;
@@ -87,8 +86,7 @@ err_out:
 }
 
 static int update_extent_range(handle_t *handle, struct inode *inode,
-                               ext4_fsblk_t pblock, ext4_lblk_t blk_num,
-                               struct list_blocks_struct *lb)
+                              ext4_fsblk_t pblock, struct migrate_struct *lb)
 {
        int retval;
        /*
@@ -96,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
         */
        if (lb->first_pblock &&
                (lb->last_pblock+1 == pblock) &&
-               (lb->last_block+1 == blk_num)) {
+               (lb->last_block+1 == lb->curr_block)) {
                lb->last_pblock = pblock;
-               lb->last_block = blk_num;
+               lb->last_block = lb->curr_block;
+               lb->curr_block++;
                return 0;
        }
        /*
@@ -106,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode,
         */
        retval = finish_range(handle, inode, lb);
        lb->first_pblock = lb->last_pblock = pblock;
-       lb->first_block = lb->last_block = blk_num;
-
+       lb->first_block = lb->last_block = lb->curr_block;
+       lb->curr_block++;
        return retval;
 }
 
 static int update_ind_extent_range(handle_t *handle, struct inode *inode,
-                                  ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
-                                  struct list_blocks_struct *lb)
+                                  ext4_fsblk_t pblock,
+                                  struct migrate_struct *lb)
 {
        struct buffer_head *bh;
        __le32 *i_data;
        int i, retval = 0;
-       ext4_lblk_t blk_count = *blk_nump;
        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-       if (!pblock) {
-               /* Only update the file block number */
-               *blk_nump += max_entries;
-               return 0;
-       }
-
        bh = sb_bread(inode->i_sb, pblock);
        if (!bh)
                return -EIO;
 
        i_data = (__le32 *)bh->b_data;
-       for (i = 0; i < max_entries; i++, blk_count++) {
+       for (i = 0; i < max_entries; i++) {
                if (i_data[i]) {
                        retval = update_extent_range(handle, inode,
-                                               le32_to_cpu(i_data[i]),
-                                               blk_count, lb);
+                                               le32_to_cpu(i_data[i]), lb);
                        if (retval)
                                break;
+               } else {
+                       lb->curr_block++;
                }
        }
-
-       /* Update the file block number */
-       *blk_nump = blk_count;
        put_bh(bh);
        return retval;
 
 }
 
 static int update_dind_extent_range(handle_t *handle, struct inode *inode,
-                                   ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
-                                   struct list_blocks_struct *lb)
+                                   ext4_fsblk_t pblock,
+                                   struct migrate_struct *lb)
 {
        struct buffer_head *bh;
        __le32 *i_data;
        int i, retval = 0;
-       ext4_lblk_t blk_count = *blk_nump;
        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-       if (!pblock) {
-               /* Only update the file block number */
-               *blk_nump += max_entries * max_entries;
-               return 0;
-       }
        bh = sb_bread(inode->i_sb, pblock);
        if (!bh)
                return -EIO;
@@ -172,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
        for (i = 0; i < max_entries; i++) {
                if (i_data[i]) {
                        retval = update_ind_extent_range(handle, inode,
-                                               le32_to_cpu(i_data[i]),
-                                               &blk_count, lb);
+                                               le32_to_cpu(i_data[i]), lb);
                        if (retval)
                                break;
                } else {
                        /* Only update the file block number */
-                       blk_count += max_entries;
+                       lb->curr_block += max_entries;
                }
        }
-
-       /* Update the file block number */
-       *blk_nump = blk_count;
        put_bh(bh);
        return retval;
 
 }
 
 static int update_tind_extent_range(handle_t *handle, struct inode *inode,
-                                    ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
-                                    struct list_blocks_struct *lb)
+                                   ext4_fsblk_t pblock,
+                                   struct migrate_struct *lb)
 {
        struct buffer_head *bh;
        __le32 *i_data;
        int i, retval = 0;
-       ext4_lblk_t blk_count = *blk_nump;
        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-       if (!pblock) {
-               /* Only update the file block number */
-               *blk_nump += max_entries * max_entries * max_entries;
-               return 0;
-       }
        bh = sb_bread(inode->i_sb, pblock);
        if (!bh)
                return -EIO;
@@ -212,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
        for (i = 0; i < max_entries; i++) {
                if (i_data[i]) {
                        retval = update_dind_extent_range(handle, inode,
-                                               le32_to_cpu(i_data[i]),
-                                               &blk_count, lb);
+                                               le32_to_cpu(i_data[i]), lb);
                        if (retval)
                                break;
-               } else
+               } else {
                        /* Only update the file block number */
-                       blk_count += max_entries * max_entries;
+                       lb->curr_block += max_entries * max_entries;
+               }
        }
-       /* Update the file block number */
-       *blk_nump = blk_count;
        put_bh(bh);
        return retval;
 
@@ -462,12 +434,12 @@ int ext4_ext_migrate(struct inode *inode)
        handle_t *handle;
        int retval = 0, i;
        __le32 *i_data;
-       ext4_lblk_t blk_count = 0;
        struct ext4_inode_info *ei;
        struct inode *tmp_inode = NULL;
-       struct list_blocks_struct lb;
+       struct migrate_struct lb;
        unsigned long max_entries;
        __u32 goal;
+       uid_t owner[2];
 
        /*
         * If the filesystem does not support extents, or the inode
@@ -495,10 +467,12 @@ int ext4_ext_migrate(struct inode *inode)
        }
        goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
                EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+       owner[0] = inode->i_uid;
+       owner[1] = inode->i_gid;
        tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
-                                  S_IFREG, NULL, goal);
+                                  S_IFREG, NULL, goal, owner);
        if (IS_ERR(tmp_inode)) {
-               retval = -ENOMEM;
+               retval = PTR_ERR(tmp_inode);
+               retval = PTR_ERR(tmp_inode);
                ext4_journal_stop(handle);
                return retval;
        }
@@ -551,35 +525,32 @@ int ext4_ext_migrate(struct inode *inode)
 
        /* 32 bit block address 4 bytes */
        max_entries = inode->i_sb->s_blocksize >> 2;
-       for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+       for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
                if (i_data[i]) {
                        retval = update_extent_range(handle, tmp_inode,
-                                               le32_to_cpu(i_data[i]),
-                                               blk_count, &lb);
+                                               le32_to_cpu(i_data[i]), &lb);
                        if (retval)
                                goto err_out;
-               }
+               } else
+                       lb.curr_block++;
        }
        if (i_data[EXT4_IND_BLOCK]) {
                retval = update_ind_extent_range(handle, tmp_inode,
-                                       le32_to_cpu(i_data[EXT4_IND_BLOCK]),
-                                       &blk_count, &lb);
+                               le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
                        if (retval)
                                goto err_out;
        } else
-               blk_count +=  max_entries;
+               lb.curr_block += max_entries;
        if (i_data[EXT4_DIND_BLOCK]) {
                retval = update_dind_extent_range(handle, tmp_inode,
-                                       le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
-                                       &blk_count, &lb);
+                               le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
                        if (retval)
                                goto err_out;
        } else
-               blk_count += max_entries * max_entries;
+               lb.curr_block += max_entries * max_entries;
        if (i_data[EXT4_TIND_BLOCK]) {
                retval = update_tind_extent_range(handle, tmp_inode,
-                                       le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
-                                       &blk_count, &lb);
+                               le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
                        if (retval)
                                goto err_out;
        }
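
The migrate rewrite above moves the running logical block number into migrate_struct.curr_block instead of threading a blk_count argument through every helper; holes only advance the cursor, while contiguous mapped blocks are merged into one range before finish_range() writes an extent. A stripped-down user-space model of that accumulation, with add_block() as a hypothetical driver feeding one physical block (or 0 for a hole) per logical block:

#include <stdio.h>

struct migrate_struct {
	unsigned int first_block, last_block, curr_block;
	unsigned long long first_pblock, last_pblock;
};

/* Stands in for finish_range(): just print the extent ext4 would insert. */
static void finish_range(struct migrate_struct *lb)
{
	if (!lb->first_pblock)
		return;
	printf("extent: logical %u-%u -> physical %llu\n",
	       lb->first_block, lb->last_block, lb->first_pblock);
}

static void add_block(struct migrate_struct *lb, unsigned long long pblock)
{
	if (!pblock) {			/* hole: only move the cursor */
		lb->curr_block++;
		return;
	}
	if (lb->first_pblock &&
	    lb->last_pblock + 1 == pblock &&
	    lb->last_block + 1 == lb->curr_block) {
		lb->last_pblock = pblock;	/* still contiguous */
		lb->last_block = lb->curr_block++;
		return;
	}
	finish_range(lb);			/* flush and start a new range */
	lb->first_pblock = lb->last_pblock = pblock;
	lb->first_block = lb->last_block = lb->curr_block++;
}

int main(void)
{
	struct migrate_struct lb = {0};
	unsigned long long map[] = { 100, 101, 102, 0, 200, 201 };

	for (unsigned int i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		add_block(&lb, map[i]);
	finish_range(&lb);	/* flush the trailing range */
	return 0;
}
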
index 9bdef3f..7ea4ba4 100644 (file)
@@ -109,7 +109,7 @@ static int kmmpd(void *data)
        mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
        bdevname(bh->b_bdev, mmp->mmp_bdevname);
 
-       memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+       memcpy(mmp->mmp_nodename, init_utsname()->nodename,
               sizeof(mmp->mmp_nodename));
 
        while (!kthread_should_stop()) {
@@ -125,8 +125,9 @@ static int kmmpd(void *data)
                 * Don't spew too many error messages. Print one every
                 * (s_mmp_update_interval * 60) seconds.
                 */
-               if (retval && (failed_writes % 60) == 0) {
-                       ext4_error(sb, "Error writing to MMP block");
+               if (retval) {
+                       if ((failed_writes % 60) == 0)
+                               ext4_error(sb, "Error writing to MMP block");
                        failed_writes++;
                }
 
@@ -295,7 +296,8 @@ skip:
        /*
         * write a new random sequence number.
         */
-       mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+       seq = mmp_new_seq();
+       mmp->mmp_seq = cpu_to_le32(seq);
 
        retval = write_mmp_block(bh);
        if (retval)
index f57455a..c5826c6 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
-#include "ext4_extents.h"
 #include "ext4.h"
 
 /**
index 1c924fa..2a75eed 100644 (file)
@@ -1586,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        dxtrace(dx_show_index("node", frames[1].entries));
                        dxtrace(dx_show_index("node",
                               ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_handle_dirty_metadata(handle, inode, bh2);
+                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
                        if (err)
                                goto journal_error;
                        brelse (bh2);
@@ -1612,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-               err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
+               err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
                if (err) {
                        ext4_std_error(inode->i_sb, err);
                        goto cleanup;
@@ -1707,9 +1707,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
  */
 static void ext4_dec_count(handle_t *handle, struct inode *inode)
 {
-       drop_nlink(inode);
-       if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
-               inc_nlink(inode);
+       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+               drop_nlink(inode);
 }
 
 
@@ -1756,7 +1755,7 @@ retry:
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
 
-       inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
+       inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ext4_file_inode_operations;
@@ -1792,7 +1791,7 @@ retry:
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
 
-       inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
+       inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                init_special_inode(inode, inode->i_mode, rdev);
@@ -1832,7 +1831,7 @@ retry:
                ext4_handle_sync(handle);
 
        inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
-                              &dentry->d_name, 0);
+                              &dentry->d_name, 0, NULL);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
@@ -1863,7 +1862,7 @@ retry:
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
        BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, dir, dir_block);
+       err = ext4_handle_dirty_metadata(handle, inode, dir_block);
        if (err)
                goto out_clear_inode;
        err = ext4_mark_inode_dirty(handle, inode);
@@ -2279,7 +2278,7 @@ retry:
                ext4_handle_sync(handle);
 
        inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
-                              &dentry->d_name, 0);
+                              &dentry->d_name, 0, NULL);
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
@@ -2530,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
                                                cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-               retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
+               retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
                if (retval) {
                        ext4_std_error(old_dir->i_sb, retval);
                        goto end_rename;
index 92f38ee..7ce1d0b 100644 (file)
@@ -70,7 +70,6 @@ static void put_io_page(struct ext4_io_page *io_page)
 void ext4_free_io_end(ext4_io_end_t *io)
 {
        int i;
-       wait_queue_head_t *wq;
 
        BUG_ON(!io);
        if (io->page)
@@ -78,56 +77,43 @@ void ext4_free_io_end(ext4_io_end_t *io)
        for (i = 0; i < io->num_io_pages; i++)
                put_io_page(io->pages[i]);
        io->num_io_pages = 0;
-       wq = ext4_ioend_wq(io->inode);
-       if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
-           waitqueue_active(wq))
-               wake_up_all(wq);
+       if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
+               wake_up_all(ext4_ioend_wq(io->inode));
        kmem_cache_free(io_end_cachep, io);
 }
 
 /*
  * check a range of space and convert unwritten extents to written.
+ *
+ * Called with inode->i_mutex held; we depend on this when we manipulate
+ * io->flag, since we could otherwise race with ext4_flush_completed_IO()
  */
 int ext4_end_io_nolock(ext4_io_end_t *io)
 {
        struct inode *inode = io->inode;
        loff_t offset = io->offset;
        ssize_t size = io->size;
-       wait_queue_head_t *wq;
        int ret = 0;
 
        ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
                   "list->prev 0x%p\n",
                   io, inode->i_ino, io->list.next, io->list.prev);
 
-       if (list_empty(&io->list))
-               return ret;
-
-       if (!(io->flag & EXT4_IO_END_UNWRITTEN))
-               return ret;
-
        ret = ext4_convert_unwritten_extents(inode, offset, size);
        if (ret < 0) {
-               printk(KERN_EMERG "%s: failed to convert unwritten "
-                       "extents to written extents, error is %d "
-                       "io is still on inode %lu aio dio list\n",
-                      __func__, ret, inode->i_ino);
-               return ret;
+               ext4_msg(inode->i_sb, KERN_EMERG,
+                        "failed to convert unwritten extents to written "
+                        "extents -- potential data loss!  "
+                        "(inode %lu, offset %llu, size %zd, error %d)",
+                        inode->i_ino, offset, size, ret);
        }
 
        if (io->iocb)
                aio_complete(io->iocb, io->result, 0);
-       /* clear the DIO AIO unwritten flag */
-       if (io->flag & EXT4_IO_END_UNWRITTEN) {
-               io->flag &= ~EXT4_IO_END_UNWRITTEN;
-               /* Wake up anyone waiting on unwritten extent conversion */
-               wq = ext4_ioend_wq(io->inode);
-               if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
-                   waitqueue_active(wq)) {
-                       wake_up_all(wq);
-               }
-       }
 
+       /* Wake up anyone waiting on unwritten extent conversion */
+       if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+               wake_up_all(ext4_ioend_wq(io->inode));
        return ret;
 }
 
@@ -140,9 +126,15 @@ static void ext4_end_io_work(struct work_struct *work)
        struct inode            *inode = io->inode;
        struct ext4_inode_info  *ei = EXT4_I(inode);
        unsigned long           flags;
-       int                     ret;
+
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       if (list_empty(&io->list)) {
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+               goto free;
+       }
 
        if (!mutex_trylock(&inode->i_mutex)) {
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
                /*
                 * Requeue the work instead of waiting so that the work
                 * items queued after this can be processed.
@@ -159,17 +151,11 @@ static void ext4_end_io_work(struct work_struct *work)
                io->flag |= EXT4_IO_END_QUEUED;
                return;
        }
-       ret = ext4_end_io_nolock(io);
-       if (ret < 0) {
-               mutex_unlock(&inode->i_mutex);
-               return;
-       }
-
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (!list_empty(&io->list))
-               list_del_init(&io->list);
+       list_del_init(&io->list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+       (void) ext4_end_io_nolock(io);
        mutex_unlock(&inode->i_mutex);
+free:
        ext4_free_io_end(io);
 }
 
@@ -350,10 +336,8 @@ submit_and_retry:
        if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
            (io_end->pages[io_end->num_io_pages-1] != io_page))
                goto submit_and_retry;
-       if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-               io_end->flag |= EXT4_IO_END_UNWRITTEN;
-               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-       }
+       if (buffer_uninit(bh))
+               ext4_set_io_unwritten_flag(inode, io_end);
        io->io_end->size += bh->b_size;
        io->io_next_block++;
        ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
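
The open-coded flag and counter update removed above is replaced by a call to ext4_set_io_unwritten_flag(); judging only from the removed lines, the helper presumably looks roughly like the sketch below (a reconstruction, not the actual ext4.h definition):

static inline void ext4_set_io_unwritten_flag(struct inode *inode,
					      ext4_io_end_t *io_end)
{
	/* mark the io_end as covering unwritten extents exactly once,
	 * and account for it in the per-inode conversion counter */
	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
		io_end->flag |= EXT4_IO_END_UNWRITTEN;
		atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
	}
}
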
index 707d3f1..996780a 100644 (file)
@@ -875,7 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
        ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
        ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-       ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+       ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
        ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
        gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
@@ -937,8 +937,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                input->reserved_blocks);
 
        /* Update the free space counts */
-       percpu_counter_add(&sbi->s_freeblocks_counter,
-                          input->free_blocks_count);
+       percpu_counter_add(&sbi->s_freeclusters_counter,
+                          EXT4_B2C(sbi, input->free_blocks_count));
        percpu_counter_add(&sbi->s_freeinodes_counter,
                           EXT4_INODES_PER_GROUP(sb));
 
@@ -946,8 +946,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
            sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group;
                flex_group = ext4_flex_group(sbi, input->group);
-               atomic_add(input->free_blocks_count,
-                          &sbi->s_flex_groups[flex_group].free_blocks);
+               atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
+                          &sbi->s_flex_groups[flex_group].free_clusters);
                atomic_add(EXT4_INODES_PER_GROUP(sb),
                           &sbi->s_flex_groups[flex_group].free_inodes);
        }
index 44d0c8d..9953d80 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/freezer.h>
 
 #include "ext4.h"
+#include "ext4_extents.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -163,8 +164,8 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
-__u32 ext4_free_blks_count(struct super_block *sb,
-                             struct ext4_group_desc *bg)
+__u32 ext4_free_group_clusters(struct super_block *sb,
+                              struct ext4_group_desc *bg)
 {
        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
@@ -219,8 +220,8 @@ void ext4_inode_table_set(struct super_block *sb,
                bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 }
 
-void ext4_free_blks_set(struct super_block *sb,
-                         struct ext4_group_desc *bg, __u32 count)
+void ext4_free_group_clusters_set(struct super_block *sb,
+                                 struct ext4_group_desc *bg, __u32 count)
 {
        bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
@@ -414,6 +415,22 @@ static void save_error_info(struct super_block *sb, const char *func,
        ext4_commit_super(sb, 1);
 }
 
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else.  Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super), will cause a kernel OOPS.
+ * This is a kludge to prevent these oopses until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+       struct inode *bd_inode = sb->s_bdev->bd_inode;
+       struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+
+       return bdi->dev == NULL;
+}
+
 
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
@@ -821,10 +838,10 @@ static void ext4_put_super(struct super_block *sb)
                brelse(sbi->s_group_desc[i]);
        ext4_kvfree(sbi->s_group_desc);
        ext4_kvfree(sbi->s_flex_groups);
-       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
-       percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+       percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < MAXQUOTAS; i++)
@@ -1057,8 +1074,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",nouid32");
        if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
                seq_puts(seq, ",debug");
-       if (test_opt(sb, OLDALLOC))
-               seq_puts(seq, ",oldalloc");
 #ifdef CONFIG_EXT4_FS_XATTR
        if (test_opt(sb, XATTR_USER))
                seq_puts(seq, ",user_xattr");
@@ -1567,10 +1582,12 @@ static int parse_options(char *options, struct super_block *sb,
                        set_opt(sb, DEBUG);
                        break;
                case Opt_oldalloc:
-                       set_opt(sb, OLDALLOC);
+                       ext4_msg(sb, KERN_WARNING,
+                                "Ignoring deprecated oldalloc option");
                        break;
                case Opt_orlov:
-                       clear_opt(sb, OLDALLOC);
+                       ext4_msg(sb, KERN_WARNING,
+                                "Ignoring deprecated orlov option");
                        break;
 #ifdef CONFIG_EXT4_FS_XATTR
                case Opt_user_xattr:
@@ -1801,6 +1818,7 @@ set_qf_format:
                        break;
                case Opt_nodelalloc:
                        clear_opt(sb, DELALLOC);
+                       clear_opt2(sb, EXPLICIT_DELALLOC);
                        break;
                case Opt_mblk_io_submit:
                        set_opt(sb, MBLK_IO_SUBMIT);
@@ -1817,6 +1835,7 @@ set_qf_format:
                        break;
                case Opt_delalloc:
                        set_opt(sb, DELALLOC);
+                       set_opt2(sb, EXPLICIT_DELALLOC);
                        break;
                case Opt_block_validity:
                        set_opt(sb, BLOCK_VALIDITY);
@@ -1935,7 +1954,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                res = MS_RDONLY;
        }
        if (read_only)
-               return res;
+               goto done;
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
                         "running e2fsck is recommended");
@@ -1966,6 +1985,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
        ext4_commit_super(sb, 1);
+done:
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
@@ -2015,8 +2035,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
                flex_group = ext4_flex_group(sbi, i);
                atomic_add(ext4_free_inodes_count(sb, gdp),
                           &sbi->s_flex_groups[flex_group].free_inodes);
-               atomic_add(ext4_free_blks_count(sb, gdp),
-                          &sbi->s_flex_groups[flex_group].free_blocks);
+               atomic_add(ext4_free_group_clusters(sb, gdp),
+                          &sbi->s_flex_groups[flex_group].free_clusters);
                atomic_add(ext4_used_dirs_count(sb, gdp),
                           &sbi->s_flex_groups[flex_group].used_dirs);
        }
@@ -2134,7 +2154,8 @@ static int ext4_check_descriptors(struct super_block *sb,
        if (NULL != first_not_zeroed)
                *first_not_zeroed = grp;
 
-       ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
+       ext4_free_blocks_count_set(sbi->s_es,
+                                  EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
        sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
        return 1;
 }
@@ -2454,7 +2475,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
                                              char *buf)
 {
        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                       (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+               (s64) EXT4_C2B(sbi,
+                       percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 }
 
 static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2682,6 +2704,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
                        return 0;
                }
        }
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+           !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+               ext4_msg(sb, KERN_ERR,
+                        "Can't support bigalloc feature without "
+                        "extents feature\n");
+               return 0;
+       }
        return 1;
 }
 
@@ -3087,10 +3116,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        char *cp;
        const char *descr;
        int ret = -ENOMEM;
-       int blocksize;
+       int blocksize, clustersize;
        unsigned int db_count;
        unsigned int i;
-       int needs_recovery, has_huge_files;
+       int needs_recovery, has_huge_files, has_bigalloc;
        __u64 blocks_count;
        int err;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3224,6 +3253,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                           &journal_ioprio, NULL, 0))
                goto failed_mount;
 
+       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+               printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+                           "with data=journal disables delayed "
+                           "allocation and O_DIRECT support!\n");
+               if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and delalloc");
+                       goto failed_mount;
+               }
+               if (test_opt(sb, DIOREAD_NOLOCK)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and dioread_nolock");
+                       goto failed_mount;
+               }
+               if (test_opt(sb, DELALLOC))
+                       clear_opt(sb, DELALLOC);
+       }
+
+       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+       if (test_opt(sb, DIOREAD_NOLOCK)) {
+               if (blocksize < PAGE_SIZE) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "dioread_nolock if block size != PAGE_SIZE");
+                       goto failed_mount;
+               }
+       }
+
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
@@ -3265,8 +3321,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
                goto failed_mount;
 
-       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
            blocksize > EXT4_MAX_BLOCK_SIZE) {
                ext4_msg(sb, KERN_ERR,
@@ -3369,12 +3423,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                sb->s_dirt = 1;
        }
 
-       if (sbi->s_blocks_per_group > blocksize * 8) {
-               ext4_msg(sb, KERN_ERR,
-                      "#blocks per group too big: %lu",
-                      sbi->s_blocks_per_group);
-               goto failed_mount;
+       /* Handle clustersize */
+       clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+       has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                               EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+       if (has_bigalloc) {
+               if (clustersize < blocksize) {
+                       ext4_msg(sb, KERN_ERR,
+                                "cluster size (%d) smaller than "
+                                "block size (%d)", clustersize, blocksize);
+                       goto failed_mount;
+               }
+               sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+                       le32_to_cpu(es->s_log_block_size);
+               sbi->s_clusters_per_group =
+                       le32_to_cpu(es->s_clusters_per_group);
+               if (sbi->s_clusters_per_group > blocksize * 8) {
+                       ext4_msg(sb, KERN_ERR,
+                                "#clusters per group too big: %lu",
+                                sbi->s_clusters_per_group);
+                       goto failed_mount;
+               }
+               if (sbi->s_blocks_per_group !=
+                   (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+                       ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+                                "clusters per group (%lu) inconsistent",
+                                sbi->s_blocks_per_group,
+                                sbi->s_clusters_per_group);
+                       goto failed_mount;
+               }
+       } else {
+               if (clustersize != blocksize) {
+                       ext4_warning(sb, "fragment/cluster size (%d) != "
+                                    "block size (%d)", clustersize,
+                                    blocksize);
+                       clustersize = blocksize;
+               }
+               if (sbi->s_blocks_per_group > blocksize * 8) {
+                       ext4_msg(sb, KERN_ERR,
+                                "#blocks per group too big: %lu",
+                                sbi->s_blocks_per_group);
+                       goto failed_mount;
+               }
+               sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+               sbi->s_cluster_bits = 0;
        }
+       sbi->s_cluster_ratio = clustersize / blocksize;
+
        if (sbi->s_inodes_per_group > blocksize * 8) {
                ext4_msg(sb, KERN_ERR,
                       "#inodes per group too big: %lu",
@@ -3446,10 +3541,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
 
-#ifdef CONFIG_PROC_FS
        if (ext4_proc_root)
                sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-#endif
 
        bgl_lock_init(sbi->s_blockgroup_lock);
 
@@ -3483,8 +3576,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_err_report.function = print_daily_error_info;
        sbi->s_err_report.data = (unsigned long) sb;
 
-       err = percpu_counter_init(&sbi->s_freeblocks_counter,
-                       ext4_count_free_blocks(sb));
+       err = percpu_counter_init(&sbi->s_freeclusters_counter,
+                       ext4_count_free_clusters(sb));
        if (!err) {
                err = percpu_counter_init(&sbi->s_freeinodes_counter,
                                ext4_count_free_inodes(sb));
@@ -3494,7 +3587,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                ext4_count_dirs(sb));
        }
        if (!err) {
-               err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+               err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
        }
        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3609,13 +3702,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         * The journal may have updated the bg summary counts, so we
         * need to update the global counters.
         */
-       percpu_counter_set(&sbi->s_freeblocks_counter,
-                          ext4_count_free_blocks(sb));
+       percpu_counter_set(&sbi->s_freeclusters_counter,
+                          ext4_count_free_clusters(sb));
        percpu_counter_set(&sbi->s_freeinodes_counter,
                           ext4_count_free_inodes(sb));
        percpu_counter_set(&sbi->s_dirs_counter,
                           ext4_count_dirs(sb));
-       percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
+       percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
 
 no_journal:
        /*
@@ -3679,25 +3772,6 @@ no_journal:
                         "available");
        }
 
-       if (test_opt(sb, DELALLOC) &&
-           (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
-               ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
-                        "requested data journaling mode");
-               clear_opt(sb, DELALLOC);
-       }
-       if (test_opt(sb, DIOREAD_NOLOCK)) {
-               if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
-                               "option - requested data journaling mode");
-                       clear_opt(sb, DIOREAD_NOLOCK);
-               }
-               if (sb->s_blocksize < PAGE_SIZE) {
-                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
-                               "option - block size is too small");
-                       clear_opt(sb, DIOREAD_NOLOCK);
-               }
-       }
-
        err = ext4_setup_system_zone(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3710,22 +3784,19 @@ no_journal:
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                         err);
-               goto failed_mount4;
+               goto failed_mount5;
        }
 
        err = ext4_register_li_request(sb, first_not_zeroed);
        if (err)
-               goto failed_mount4;
+               goto failed_mount6;
 
        sbi->s_kobj.kset = ext4_kset;
        init_completion(&sbi->s_kobj_unregister);
        err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
                                   "%s", sb->s_id);
-       if (err) {
-               ext4_mb_release(sb);
-               ext4_ext_release(sb);
-               goto failed_mount4;
-       };
+       if (err)
+               goto failed_mount7;
 
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
@@ -3759,13 +3830,19 @@ cantfind_ext4:
                ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
        goto failed_mount;
 
+failed_mount7:
+       ext4_unregister_li_request(sb);
+failed_mount6:
+       ext4_ext_release(sb);
+failed_mount5:
+       ext4_mb_release(sb);
+       ext4_release_system_zone(sb);
 failed_mount4:
        iput(root);
        sb->s_root = NULL;
        ext4_msg(sb, KERN_ERR, "mount failed");
        destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
 failed_mount_wq:
-       ext4_release_system_zone(sb);
        if (sbi->s_journal) {
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
@@ -3774,10 +3851,10 @@ failed_mount3:
        del_timer(&sbi->s_err_report);
        if (sbi->s_flex_groups)
                ext4_kvfree(sbi->s_flex_groups);
-       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
-       percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+       percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -4064,7 +4141,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;
 
-       if (!sbh)
+       if (!sbh || block_device_ejected(sb))
                return error;
        if (buffer_write_io_error(sbh)) {
                /*
@@ -4100,8 +4177,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-       ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-                                          &EXT4_SB(sb)->s_freeblocks_counter));
+       ext4_free_blocks_count_set(es,
+                       EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+                               &EXT4_SB(sb)->s_freeclusters_counter)));
        es->s_free_inodes_count =
                cpu_to_le32(percpu_counter_sum_positive(
                                &EXT4_SB(sb)->s_freeinodes_counter));
@@ -4506,16 +4584,34 @@ restore_opts:
        return err;
 }
 
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc.  This is because multiple metadata blocks from
+ * different block group can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time.  We will still calculate the overhead for
+ * older file systems --- and if we come across a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock.  If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ */
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
+       struct ext4_group_desc *gdp;
        u64 fsid;
        s64 bfree;
 
        if (test_opt(sb, MINIX_DF)) {
                sbi->s_overhead_last = 0;
+       } else if (es->s_overhead_clusters) {
+               sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
        } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
                ext4_group_t i, ngroups = ext4_get_groups_count(sb);
                ext4_fsblk_t overhead = 0;
@@ -4530,24 +4626,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
                 * All of the blocks before first_data_block are
                 * overhead
                 */
-               overhead = le32_to_cpu(es->s_first_data_block);
+               overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
 
                /*
-                * Add the overhead attributed to the superblock and
-                * block group descriptors.  If the sparse superblocks
-                * feature is turned on, then not all groups have this.
+                * Add the overhead found in each block group
                 */
                for (i = 0; i < ngroups; i++) {
-                       overhead += ext4_bg_has_super(sb, i) +
-                               ext4_bg_num_gdb(sb, i);
+                       gdp = ext4_get_group_desc(sb, i, NULL);
+                       overhead += ext4_num_overhead_clusters(sb, i, gdp);
                        cond_resched();
                }
-
-               /*
-                * Every block group has an inode bitmap, a block
-                * bitmap, and an inode table.
-                */
-               overhead += ngroups * (2 + sbi->s_itb_per_group);
                sbi->s_overhead_last = overhead;
                smp_wmb();
                sbi->s_blocks_last = ext4_blocks_count(es);
@@ -4555,11 +4643,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 
        buf->f_type = EXT4_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
-       buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-       bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
-                      percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+       buf->f_blocks = (ext4_blocks_count(es) -
+                        EXT4_C2B(sbi, sbi->s_overhead_last));
+       bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+               percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* prevent underflow in case that few free space is available */
-       buf->f_bfree = max_t(s64, bfree, 0);
+       buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
        buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
        if (buf->f_bfree < ext4_r_blocks_count(es))
                buf->f_bavail = 0;
@@ -4980,13 +5069,11 @@ static int __init ext4_init_fs(void)
                return err;
        err = ext4_init_system_zone();
        if (err)
-               goto out7;
+               goto out6;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
-               goto out6;
-       ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-       if (!ext4_proc_root)
                goto out5;
+       ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 
        err = ext4_init_feat_adverts();
        if (err)
@@ -5022,12 +5109,12 @@ out2:
 out3:
        ext4_exit_feat_adverts();
 out4:
-       remove_proc_entry("fs/ext4", NULL);
-out5:
+       if (ext4_proc_root)
+               remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
-out6:
+out5:
        ext4_exit_system_zone();
-out7:
+out6:
        ext4_exit_pageio();
        return err;
 }
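
The clustersize handling added to ext4_fill_super() reduces to two derived values: s_cluster_bits, the log2 of blocks per cluster, and s_cluster_ratio, the ratio itself; the EXT4_B2C()/EXT4_C2B() conversions used throughout the series then become plain shifts. A user-space sketch of the same arithmetic for a 4k block / 64k cluster file system, with macro definitions that are assumptions modelled on ext4.h:

#include <stdio.h>

#define B2C(bits, blk)      ((blk) >> (bits))     /* blocks   -> clusters (rounds down) */
#define C2B(bits, cluster)  ((cluster) << (bits)) /* clusters -> blocks */

int main(void)
{
	unsigned int log_block_size = 2;	/* 4096  = 1024 << 2 */
	unsigned int log_cluster_size = 6;	/* 65536 = 1024 << 6 */
	unsigned int cluster_bits = log_cluster_size - log_block_size;
	unsigned int cluster_ratio = 1U << cluster_bits;

	/* prints: cluster_bits=4 ratio=16 */
	printf("cluster_bits=%u ratio=%u\n", cluster_bits, cluster_ratio);
	/* 1000 blocks span 62 full clusters; 10 clusters hold 160 blocks */
	printf("1000 blocks = %u clusters\n", B2C(cluster_bits, 1000u));
	printf("10 clusters = %u blocks\n", C2B(cluster_bits, 10u));
	return 0;
}
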
index c757adc..93a00d8 100644 (file)
@@ -820,8 +820,14 @@ inserted:
                        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                                goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
+                       /*
+                        * take i_data_sem because we will test
+                        * i_delalloc_reserved_flag in ext4_mb_new_blocks
+                        */
+                       down_read((&EXT4_I(inode)->i_data_sem));
                        block = ext4_new_meta_blocks(handle, inode, goal, 0,
                                                     NULL, &error);
+                       up_read((&EXT4_I(inode)->i_data_sem));
                        if (error)
                                goto cleanup;
 
@@ -985,11 +991,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
        no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 
-       error = ext4_get_inode_loc(inode, &is.iloc);
-       if (error)
-               goto cleanup;
-
-       error = ext4_journal_get_write_access(handle, is.iloc.bh);
+       error = ext4_reserve_inode_write(handle, inode, &is.iloc);
        if (error)
                goto cleanup;
 
index 9fe061f..fea8dd6 100644 (file)
@@ -1135,6 +1135,14 @@ static int journal_get_superblock(journal_t *journal)
                goto out;
        }
 
+       if (be32_to_cpu(sb->s_first) == 0 ||
+           be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+               printk(KERN_WARNING
+                       "JBD: Invalid start block of journal: %u\n",
+                       be32_to_cpu(sb->s_first));
+               goto out;
+       }
+
        return 0;
 
 out:
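
The new check above rejects a journal superblock whose s_first lies outside the journal area: block 0 of the journal holds the superblock itself, so a valid first log block satisfies 1 <= s_first < j_maxlen. A tiny stand-alone restatement of that predicate (the function name is illustrative):

#include <stdio.h>

static int journal_first_block_valid(unsigned int s_first, unsigned int j_maxlen)
{
	return s_first != 0 && s_first < j_maxlen;
}

int main(void)
{
	printf("%d %d %d\n",
	       journal_first_block_valid(1, 8192),	/* 1: typical layout  */
	       journal_first_block_valid(0, 8192),	/* 0: rejected        */
	       journal_first_block_valid(9000, 8192));	/* 0: past the end    */
	return 0;
}
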
index eef6979..68d704d 100644 (file)
@@ -352,7 +352,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
        trace_jbd2_start_commit(journal, commit_transaction);
-       jbd_debug(1, "JBD: starting commit of transaction %d\n",
+       jbd_debug(1, "JBD2: starting commit of transaction %d\n",
                        commit_transaction->t_tid);
 
        write_lock(&journal->j_state_lock);
@@ -427,7 +427,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);
 
-       jbd_debug (3, "JBD: commit phase 1\n");
+       jbd_debug(3, "JBD2: commit phase 1\n");
 
        /*
         * Switch to a new revoke table.
@@ -447,7 +447,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        wake_up(&journal->j_wait_transaction_locked);
        write_unlock(&journal->j_state_lock);
 
-       jbd_debug (3, "JBD: commit phase 2\n");
+       jbd_debug(3, "JBD2: commit phase 2\n");
 
        /*
         * Now start flushing things to disk, in the order they appear
@@ -462,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                                          WRITE_SYNC);
        blk_finish_plug(&plug);
 
-       jbd_debug(3, "JBD: commit phase 2\n");
+       jbd_debug(3, "JBD2: commit phase 2\n");
 
        /*
         * Way to go: we have now written out all of the data for a
@@ -522,7 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
                        J_ASSERT (bufs == 0);
 
-                       jbd_debug(4, "JBD: get descriptor\n");
+                       jbd_debug(4, "JBD2: get descriptor\n");
 
                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
@@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                        }
 
                        bh = jh2bh(descriptor);
-                       jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+                       jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
@@ -625,7 +625,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16) {
 
-                       jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+                       jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 
                        /* Write an end-of-descriptor marker before
                            submitting the IOs.  "tag" still points to
@@ -707,7 +707,7 @@ start_journal_io:
           so we incur less scheduling load.
        */
 
-       jbd_debug(3, "JBD: commit phase 3\n");
+       jbd_debug(3, "JBD2: commit phase 3\n");
 
        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -771,7 +771,7 @@ wait_for_iobuf:
 
        J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-       jbd_debug(3, "JBD: commit phase 4\n");
+       jbd_debug(3, "JBD2: commit phase 4\n");
 
        /* Here we wait for the revoke record and descriptor record buffers */
  wait_for_ctlbuf:
@@ -801,7 +801,7 @@ wait_for_iobuf:
        if (err)
                jbd2_journal_abort(journal, err);
 
-       jbd_debug(3, "JBD: commit phase 5\n");
+       jbd_debug(3, "JBD2: commit phase 5\n");
        write_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
        commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -830,7 +830,7 @@ wait_for_iobuf:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-       jbd_debug(3, "JBD: commit phase 6\n");
+       jbd_debug(3, "JBD2: commit phase 6\n");
 
        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -964,7 +964,7 @@ restart_loop:
 
        /* Done with this transaction! */
 
-       jbd_debug(3, "JBD: commit phase 7\n");
+       jbd_debug(3, "JBD2: commit phase 7\n");
 
        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
@@ -1039,7 +1039,7 @@ restart_loop:
                journal->j_commit_callback(journal, commit_transaction);
 
        trace_jbd2_end_commit(journal, commit_transaction);
-       jbd_debug(1, "JBD: commit %d complete, head %d\n",
+       jbd_debug(1, "JBD2: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
        if (to_free)
                kfree(commit_transaction);
index f24df13..0fa0123 100644 (file)
@@ -491,7 +491,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
                 */
 
                journal->j_commit_request = target;
-               jbd_debug(1, "JBD: requesting commit %d/%d\n",
+               jbd_debug(1, "JBD2: requesting commit %d/%d\n",
                          journal->j_commit_request,
                          journal->j_commit_sequence);
                wake_up(&journal->j_wait_commit);
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
                /* This should never happen, but if it does, preserve
                   the evidence before kjournald goes into a loop and
                   increments j_commit_sequence beyond all recognition. */
-               WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+               WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
                          journal->j_commit_request,
                          journal->j_commit_sequence,
                          target, journal->j_running_transaction ? 
@@ -645,7 +645,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
        }
 #endif
        while (tid_gt(tid, journal->j_commit_sequence)) {
-               jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+               jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
                                  tid, journal->j_commit_sequence);
                wake_up(&journal->j_wait_commit);
                read_unlock(&journal->j_state_lock);
@@ -1093,7 +1093,7 @@ static int journal_reset(journal_t *journal)
        first = be32_to_cpu(sb->s_first);
        last = be32_to_cpu(sb->s_maxlen);
        if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
-               printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+               printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
                       first, last);
                journal_fail_superblock(journal);
                return -EINVAL;
@@ -1139,7 +1139,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
         */
        if (sb->s_start == 0 && journal->j_tail_sequence ==
                                journal->j_transaction_sequence) {
-               jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
+               jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
                        "(start %ld, seq %d, errno %d)\n",
                        journal->j_tail, journal->j_tail_sequence,
                        journal->j_errno);
@@ -1163,7 +1163,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
        }
 
        read_lock(&journal->j_state_lock);
-       jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+       jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
                  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
        sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1216,8 +1216,8 @@ static int journal_get_superblock(journal_t *journal)
                ll_rw_block(READ, 1, &bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
-                       printk (KERN_ERR
-                               "JBD: IO error reading journal superblock\n");
+                       printk(KERN_ERR
+                               "JBD2: IO error reading journal superblock\n");
                        goto out;
                }
        }
@@ -1228,7 +1228,7 @@ static int journal_get_superblock(journal_t *journal)
 
        if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
            sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
-               printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+               printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
                goto out;
        }
 
@@ -1240,14 +1240,22 @@ static int journal_get_superblock(journal_t *journal)
                journal->j_format_version = 2;
                break;
        default:
-               printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+               printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
                goto out;
        }
 
        if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
                journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
        else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
-               printk (KERN_WARNING "JBD: journal file too short\n");
+               printk(KERN_WARNING "JBD2: journal file too short\n");
+               goto out;
+       }
+
+       if (be32_to_cpu(sb->s_first) == 0 ||
+           be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+               printk(KERN_WARNING
+                       "JBD2: Invalid start block of journal: %u\n",
+                       be32_to_cpu(sb->s_first));
                goto out;
        }
 
@@ -1310,8 +1318,8 @@ int jbd2_journal_load(journal_t *journal)
                     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
                    (sb->s_feature_incompat &
                     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
-                       printk (KERN_WARNING
-                               "JBD: Unrecognised features on journal\n");
+                       printk(KERN_WARNING
+                               "JBD2: Unrecognised features on journal\n");
                        return -EINVAL;
                }
        }
@@ -1346,7 +1354,7 @@ int jbd2_journal_load(journal_t *journal)
        return 0;
 
 recovery_error:
-       printk (KERN_WARNING "JBD: recovery failed\n");
+       printk(KERN_WARNING "JBD2: recovery failed\n");
        return -EIO;
 }
 
@@ -1577,7 +1585,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
        struct buffer_head *bh;
 
        printk(KERN_WARNING
-               "JBD: Converting superblock from version 1 to 2.\n");
+               "JBD2: Converting superblock from version 1 to 2.\n");
 
        /* Pre-initialise new fields to zero */
        offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
@@ -1694,7 +1702,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
        if (!journal->j_tail)
                goto no_recovery;
 
-       printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+       printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
                write ? "Clearing" : "Ignoring");
 
        err = jbd2_journal_skip_recovery(journal);
@@ -2020,7 +2028,7 @@ static int journal_init_jbd2_journal_head_cache(void)
        retval = 0;
        if (!jbd2_journal_head_cache) {
                retval = -ENOMEM;
-               printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+               printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
        }
        return retval;
 }
@@ -2383,7 +2391,7 @@ static void __exit journal_exit(void)
 #ifdef CONFIG_JBD2_DEBUG
        int n = atomic_read(&nr_journal_heads);
        if (n)
-               printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+               printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
 #endif
        jbd2_remove_debugfs_entry();
        jbd2_remove_jbd_stats_proc_entry();
index 1cad869..da6d7ba 100644 (file)
@@ -89,7 +89,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
                err = jbd2_journal_bmap(journal, next, &blocknr);
 
                if (err) {
-                       printk (KERN_ERR "JBD: bad block at offset %u\n",
+                       printk(KERN_ERR "JBD2: bad block at offset %u\n",
                                next);
                        goto failed;
                }
@@ -138,14 +138,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
        *bhp = NULL;
 
        if (offset >= journal->j_maxlen) {
-               printk(KERN_ERR "JBD: corrupted journal superblock\n");
+               printk(KERN_ERR "JBD2: corrupted journal superblock\n");
                return -EIO;
        }
 
        err = jbd2_journal_bmap(journal, offset, &blocknr);
 
        if (err) {
-               printk (KERN_ERR "JBD: bad block at offset %u\n",
+               printk(KERN_ERR "JBD2: bad block at offset %u\n",
                        offset);
                return err;
        }
@@ -163,7 +163,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
        }
 
        if (!buffer_uptodate(bh)) {
-               printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+               printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
                        offset);
                brelse(bh);
                return -EIO;
@@ -251,10 +251,10 @@ int jbd2_journal_recover(journal_t *journal)
        if (!err)
                err = do_one_pass(journal, &info, PASS_REPLAY);
 
-       jbd_debug(1, "JBD: recovery, exit status %d, "
+       jbd_debug(1, "JBD2: recovery, exit status %d, "
                  "recovered transactions %u to %u\n",
                  err, info.start_transaction, info.end_transaction);
-       jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
+       jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
                  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
 
        /* Restart the log at the next transaction ID, thus invalidating
@@ -293,14 +293,14 @@ int jbd2_journal_skip_recovery(journal_t *journal)
        err = do_one_pass(journal, &info, PASS_SCAN);
 
        if (err) {
-               printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+               printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
                ++journal->j_transaction_sequence;
        } else {
 #ifdef CONFIG_JBD2_DEBUG
                int dropped = info.end_transaction - 
                        be32_to_cpu(journal->j_superblock->s_sequence);
                jbd_debug(1,
-                         "JBD: ignoring %d transaction%s from the journal.\n",
+                         "JBD2: ignoring %d transaction%s from the journal.\n",
                          dropped, (dropped == 1) ? "" : "s");
 #endif
                journal->j_transaction_sequence = ++info.end_transaction;
@@ -338,7 +338,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
                wrap(journal, *next_log_block);
                err = jread(&obh, journal, io_block);
                if (err) {
-                       printk(KERN_ERR "JBD: IO error %d recovering block "
+                       printk(KERN_ERR "JBD2: IO error %d recovering block "
                                "%lu in log\n", err, io_block);
                        return 1;
                } else {
@@ -411,7 +411,7 @@ static int do_one_pass(journal_t *journal,
                 * either the next descriptor block or the final commit
                 * record. */
 
-               jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+               jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
                err = jread(&bh, journal, next_log_block);
                if (err)
                        goto failed;
@@ -491,8 +491,8 @@ static int do_one_pass(journal_t *journal,
                                        /* Recover what we can, but
                                         * report failure at the end. */
                                        success = err;
-                                       printk (KERN_ERR
-                                               "JBD: IO error %d recovering "
+                                       printk(KERN_ERR
+                                               "JBD2: IO error %d recovering "
                                                "block %ld in log\n",
                                                err, io_block);
                                } else {
@@ -520,7 +520,7 @@ static int do_one_pass(journal_t *journal,
                                                        journal->j_blocksize);
                                        if (nbh == NULL) {
                                                printk(KERN_ERR
-                                                      "JBD: Out of memory "
+                                                      "JBD2: Out of memory "
                                                       "during recovery.\n");
                                                err = -ENOMEM;
                                                brelse(bh);
@@ -689,7 +689,7 @@ static int do_one_pass(journal_t *journal,
                /* It's really bad news if different passes end up at
                 * different places (but possible due to IO errors). */
                if (info->end_transaction != next_commit_ID) {
-                       printk (KERN_ERR "JBD: recovery pass %d ended at "
+                       printk(KERN_ERR "JBD2: recovery pass %d ended at "
                                "transaction %u, expected %u\n",
                                pass, next_commit_ID, info->end_transaction);
                        if (!success)
index 2d71094..a0e41a4 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
 #include <linux/backing-dev.h>
+#include <linux/bug.h>
 #include <linux/module.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -115,7 +116,7 @@ static inline void update_t_max_wait(transaction_t *transaction,
  */
 
 static int start_this_handle(journal_t *journal, handle_t *handle,
-                            int gfp_mask)
+                            gfp_t gfp_mask)
 {
        transaction_t   *transaction, *new_transaction = NULL;
        tid_t           tid;
@@ -124,7 +125,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
        unsigned long ts = jiffies;
 
        if (nblocks > journal->j_max_transaction_buffers) {
-               printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
+               printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
                       current->comm, nblocks,
                       journal->j_max_transaction_buffers);
                return -ENOSPC;
@@ -320,7 +321,7 @@ static handle_t *new_handle(int nblocks)
  * Return a pointer to a newly allocated handle, or an ERR_PTR() value
  * on failure.
  */
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
 {
        handle_t *handle = journal_current_handle();
        int err;
@@ -443,7 +444,7 @@ out:
  * transaction capable of guaranteeing the requested number of
  * credits.
  */
-int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
@@ -563,7 +564,7 @@ static void warn_dirty_buffer(struct buffer_head *bh)
        char b[BDEVNAME_SIZE];
 
        printk(KERN_WARNING
-              "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+              "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
               "There's a risk of filesystem corruption in case of system "
               "crash.\n",
               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
@@ -1049,6 +1050,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
  * mark dirty metadata which needs to be journaled as part of the current
  * transaction.
  *
+ * The buffer must have previously had jbd2_journal_get_write_access()
+ * called so that it has a valid journal_head attached to the buffer
+ * head.
+ *
  * The buffer is placed on the transaction's metadata list and is marked
  * as belonging to the transaction.
  *
@@ -1065,11 +1070,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        struct journal_head *jh = bh2jh(bh);
+       int ret = 0;
 
        jbd_debug(5, "journal_head %p\n", jh);
        JBUFFER_TRACE(jh, "entry");
        if (is_handle_aborted(handle))
                goto out;
+       if (!buffer_jbd(bh)) {
+               ret = -EUCLEAN;
+               goto out;
+       }
 
        jbd_lock_bh_state(bh);
 
@@ -1093,8 +1103,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
         */
        if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
                JBUFFER_TRACE(jh, "fastpath");
-               J_ASSERT_JH(jh, jh->b_transaction ==
-                                       journal->j_running_transaction);
+               if (unlikely(jh->b_transaction !=
+                            journal->j_running_transaction)) {
+                       printk(KERN_EMERG "JBD: %s: "
+                              "jh->b_transaction (%llu, %p, %u) != "
+                              "journal->j_running_transaction (%p, %u)",
+                              journal->j_devname,
+                              (unsigned long long) bh->b_blocknr,
+                              jh->b_transaction,
+                              jh->b_transaction ? jh->b_transaction->t_tid : 0,
+                              journal->j_running_transaction,
+                              journal->j_running_transaction ?
+                              journal->j_running_transaction->t_tid : 0);
+                       ret = -EINVAL;
+               }
                goto out_unlock_bh;
        }
 
@@ -1108,9 +1130,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
         */
        if (jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "already on other transaction");
-               J_ASSERT_JH(jh, jh->b_transaction ==
-                                       journal->j_committing_transaction);
-               J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+               if (unlikely(jh->b_transaction !=
+                            journal->j_committing_transaction)) {
+                       printk(KERN_EMERG "JBD: %s: "
+                              "jh->b_transaction (%llu, %p, %u) != "
+                              "journal->j_committing_transaction (%p, %u)",
+                              journal->j_devname,
+                              (unsigned long long) bh->b_blocknr,
+                              jh->b_transaction,
+                              jh->b_transaction ? jh->b_transaction->t_tid : 0,
+                              journal->j_committing_transaction,
+                              journal->j_committing_transaction ?
+                              journal->j_committing_transaction->t_tid : 0);
+                       ret = -EINVAL;
+               }
+               if (unlikely(jh->b_next_transaction != transaction)) {
+                       printk(KERN_EMERG "JBD: %s: "
+                              "jh->b_next_transaction (%llu, %p, %u) != "
+                              "transaction (%p, %u)",
+                              journal->j_devname,
+                              (unsigned long long) bh->b_blocknr,
+                              jh->b_next_transaction,
+                              jh->b_next_transaction ?
+                              jh->b_next_transaction->t_tid : 0,
+                              transaction, transaction->t_tid);
+                       ret = -EINVAL;
+               }
                /* And this case is illegal: we can't reuse another
                 * transaction's data buffer, ever. */
                goto out_unlock_bh;
@@ -1127,7 +1172,8 @@ out_unlock_bh:
        jbd_unlock_bh_state(bh);
 out:
        JBUFFER_TRACE(jh, "exit");
-       return 0;
+       WARN_ON(ret);   /* All errors are bugs, so dump the stack */
+       return ret;
 }
 
 /*
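
With the J_ASSERTs above demoted to diagnostics, jbd2_journal_dirty_metadata() can now return -EUCLEAN (no journal_head attached, i.e. jbd2_journal_get_write_access() was never called) or -EINVAL (the buffer belongs to the wrong transaction) instead of taking the whole machine down; the WARN_ON(ret) keeps a stack trace, since any non-zero return is still a caller bug. Callers are therefore expected to check the return value. A hedged caller-side sketch (example_dirty is illustrative; in ext4 the check happens inside its ext4_handle_dirty_metadata() wrapper):

        /* Illustrative only: propagate the new error return instead of assuming success. */
        static int example_dirty(handle_t *handle, struct buffer_head *bh)
        {
                int err = jbd2_journal_dirty_metadata(handle, bh);

                if (err)
                        return err;     /* ext4 escalates this through its own error helpers */
                return 0;
        }
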
index 53792bf..ce1b719 100644 (file)
@@ -197,8 +197,8 @@ struct ext2_group_desc
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\
-                          EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\
-                          EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\
+                          EXT2_SYNC_FL | EXT2_NODUMP_FL |\
+                          EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\
                           EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\
                           EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL)
 
index f5fceff..dec9911 100644 (file)
@@ -180,8 +180,8 @@ struct ext3_group_desc
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
-                          EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\
-                          EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\
+                          EXT3_SYNC_FL | EXT3_NODUMP_FL |\
+                          EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
                           EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
                           EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
 
index 7a049fd..78af938 100644 (file)
@@ -770,12 +770,13 @@ struct inode {
        unsigned long           i_ino;
        unsigned int            i_nlink;
        dev_t                   i_rdev;
-       loff_t                  i_size;
        struct timespec         i_atime;
        struct timespec         i_mtime;
        struct timespec         i_ctime;
-       unsigned int            i_blkbits;
+       spinlock_t              i_lock; /* i_blocks, i_bytes, maybe i_size */
+       unsigned short          i_bytes;
        blkcnt_t                i_blocks;
+       loff_t                  i_size;
 
 #ifdef __NEED_I_SIZE_ORDERED
        seqcount_t              i_size_seqcount;
@@ -783,7 +784,6 @@ struct inode {
 
        /* Misc */
        unsigned long           i_state;
-       spinlock_t              i_lock; /* i_blocks, i_bytes, maybe i_size */
        struct mutex            i_mutex;
 
        unsigned long           dirtied_when;   /* jiffies of first dirtying */
@@ -797,9 +797,10 @@ struct inode {
                struct rcu_head         i_rcu;
        };
        atomic_t                i_count;
+       unsigned int            i_blkbits;
        u64                     i_version;
-       unsigned short          i_bytes;
        atomic_t                i_dio_count;
+       atomic_t                i_writecount;
        const struct file_operations    *i_fop; /* former ->i_op->default_file_ops */
        struct file_lock        *i_flock;
        struct address_space    i_data;
@@ -823,7 +824,6 @@ struct inode {
 #ifdef CONFIG_IMA
        atomic_t                i_readcount; /* struct files open RO */
 #endif
-       atomic_t                i_writecount;
        void                    *i_private; /* fs or device private pointer */
 };
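
The struct inode hunks above only rearrange fields: i_lock, i_bytes and i_blocks end up next to i_size, while i_blkbits and i_writecount move into existing gaps, so padding holes shrink and fields that are written together share cache lines. The effect is the one this self-contained toy program shows (the struct names and members here are invented purely for illustration):

        #include <stdio.h>

        struct holey {          /* char/long/char/long leaves two 7-byte holes on LP64 */
                char a;
                long b;
                char c;
                long d;
        };

        struct packed_better {  /* grouping the small members shrinks the struct */
                long b;
                long d;
                char a;
                char c;
        };

        int main(void)
        {
                printf("holey: %zu bytes, packed_better: %zu bytes\n",
                       sizeof(struct holey), sizeof(struct packed_better));
                return 0;
        }
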
 
index e6a5e34..c7acdde 100644 (file)
@@ -244,6 +244,7 @@ typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)       BUG_ON(!(assert))
 
@@ -270,69 +271,6 @@ typedef struct journal_superblock_s
 #define J_EXPECT_JH(jh, expr, why...)  __journal_expect(expr, ## why)
 #endif
 
-enum jbd_state_bits {
-       BH_JBD                  /* Has an attached ext3 journal_head */
-         = BH_PrivateStart,
-       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
-       BH_Freed,               /* Has been freed (truncated) */
-       BH_Revoked,             /* Has been revoked from the log */
-       BH_RevokeValid,         /* Revoked flag is valid */
-       BH_JBDDirty,            /* Is dirty but journaled */
-       BH_State,               /* Pins most journal_head state */
-       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
-       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-
-static inline struct buffer_head *jh2bh(struct journal_head *jh)
-{
-       return jh->b_bh;
-}
-
-static inline struct journal_head *bh2jh(struct buffer_head *bh)
-{
-       return bh->b_private;
-}
-
-static inline void jbd_lock_bh_state(struct buffer_head *bh)
-{
-       bit_spin_lock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_trylock_bh_state(struct buffer_head *bh)
-{
-       return bit_spin_trylock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
-{
-       return bit_spin_is_locked(BH_State, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_state(struct buffer_head *bh)
-{
-       bit_spin_unlock(BH_State, &bh->b_state);
-}
-
-static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
-{
-       bit_spin_lock(BH_JournalHead, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
-{
-       bit_spin_unlock(BH_JournalHead, &bh->b_state);
-}
-
 struct jbd_revoke_table_s;
 
 /**
index 38f307b..2092ea2 100644 (file)
@@ -275,6 +275,7 @@ typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)       BUG_ON(!(assert))
 
@@ -302,70 +303,6 @@ typedef struct journal_superblock_s
 #define J_EXPECT_JH(jh, expr, why...)  __journal_expect(expr, ## why)
 #endif
 
-enum jbd_state_bits {
-       BH_JBD                  /* Has an attached ext3 journal_head */
-         = BH_PrivateStart,
-       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
-       BH_Freed,               /* Has been freed (truncated) */
-       BH_Revoked,             /* Has been revoked from the log */
-       BH_RevokeValid,         /* Revoked flag is valid */
-       BH_JBDDirty,            /* Is dirty but journaled */
-       BH_State,               /* Pins most journal_head state */
-       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
-       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
-       BH_JBDPrivateStart,     /* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-
-static inline struct buffer_head *jh2bh(struct journal_head *jh)
-{
-       return jh->b_bh;
-}
-
-static inline struct journal_head *bh2jh(struct buffer_head *bh)
-{
-       return bh->b_private;
-}
-
-static inline void jbd_lock_bh_state(struct buffer_head *bh)
-{
-       bit_spin_lock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_trylock_bh_state(struct buffer_head *bh)
-{
-       return bit_spin_trylock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
-{
-       return bit_spin_is_locked(BH_State, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_state(struct buffer_head *bh)
-{
-       bit_spin_unlock(BH_State, &bh->b_state);
-}
-
-static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
-{
-       bit_spin_lock(BH_JournalHead, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
-{
-       bit_spin_unlock(BH_JournalHead, &bh->b_state);
-}
-
 /* Flags in jbd_inode->i_flags */
 #define __JI_COMMIT_RUNNING 0
 /* Commit of the inode data in progress. We use this flag to protect us from
@@ -1106,9 +1043,9 @@ static inline handle_t *journal_current_handle(void)
  */
 
 extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
-extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask);
+extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask);
 extern int      jbd2_journal_restart(handle_t *, int nblocks);
-extern int      jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask);
+extern int      jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
 extern int      jbd2_journal_extend (handle_t *, int nblocks);
 extern int      jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int      jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
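
The prototype change above gives the allocation-mask argument its proper gfp_t type instead of a bare int; nothing else about the API changes, and the non-underscored entry points keep passing GFP_NOFS so starting a handle never recurses into the filesystem under memory pressure. In substance the default wrapper is (sketch, not the verbatim kernel source):

        /* Sketch of the jbd2_journal_start() wrapper with the now correctly typed mask. */
        static handle_t *example_start(journal_t *journal, int nblocks)
        {
                return jbd2__journal_start(journal, nblocks, GFP_NOFS);
        }
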
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
new file mode 100644 (file)
index 0000000..6230f85
--- /dev/null
@@ -0,0 +1,68 @@
+#ifndef _LINUX_JBD_STATE_H
+#define _LINUX_JBD_STATE_H
+
+enum jbd_state_bits {
+       BH_JBD                  /* Has an attached ext3 journal_head */
+         = BH_PrivateStart,
+       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
+       BH_Freed,               /* Has been freed (truncated) */
+       BH_Revoked,             /* Has been revoked from the log */
+       BH_RevokeValid,         /* Revoked flag is valid */
+       BH_JBDDirty,            /* Is dirty but journaled */
+       BH_State,               /* Pins most journal_head state */
+       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
+       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
+       BH_JBDPrivateStart,     /* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+
+static inline struct buffer_head *jh2bh(struct journal_head *jh)
+{
+       return jh->b_bh;
+}
+
+static inline struct journal_head *bh2jh(struct buffer_head *bh)
+{
+       return bh->b_private;
+}
+
+static inline void jbd_lock_bh_state(struct buffer_head *bh)
+{
+       bit_spin_lock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_trylock_bh_state(struct buffer_head *bh)
+{
+       return bit_spin_trylock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
+{
+       return bit_spin_is_locked(BH_State, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_state(struct buffer_head *bh)
+{
+       bit_spin_unlock(BH_State, &bh->b_state);
+}
+
+static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
+{
+       bit_spin_lock(BH_JournalHead, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
+{
+       bit_spin_unlock(BH_JournalHead, &bh->b_state);
+}
+
+#endif
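
The new header is pure code motion: the buffer-head state bits and the bit-spinlock helpers that jbd and jbd2 previously carried as duplicate copies now live in one shared file (the jbd side simply gains the BH_JBDPrivateStart bit that only jbd2 had). The helpers are used in the usual pattern of holding BH_State across any inspection of a buffer's journal_head; a minimal illustrative sketch (example_peek_jh is not a real kernel function):

        /* Illustrative only: inspect journal_head state under the BH_State bit lock. */
        static void example_peek_jh(struct buffer_head *bh)
        {
                jbd_lock_bh_state(bh);
                if (buffer_jbd(bh)) {
                        struct journal_head *jh = bh2jh(bh);
                        /* ... examine jh->b_transaction, jh->b_jlist, ... */
                        (void)jh;
                }
                jbd_unlock_bh_state(bh);
        }
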
index b50a547..748ff7c 100644 (file)
@@ -9,9 +9,12 @@
 
 struct ext4_allocation_context;
 struct ext4_allocation_request;
+struct ext4_extent;
 struct ext4_prealloc_space;
 struct ext4_inode_info;
 struct mpage_da_data;
+struct ext4_map_blocks;
+struct ext4_extent;
 
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
@@ -1032,9 +1035,9 @@ TRACE_EVENT(ext4_forget,
 );
 
 TRACE_EVENT(ext4_da_update_reserve_space,
-       TP_PROTO(struct inode *inode, int used_blocks),
+       TP_PROTO(struct inode *inode, int used_blocks, int quota_claim),
 
-       TP_ARGS(inode, used_blocks),
+       TP_ARGS(inode, used_blocks, quota_claim),
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
@@ -1045,6 +1048,7 @@ TRACE_EVENT(ext4_da_update_reserve_space,
                __field(        int,    reserved_data_blocks    )
                __field(        int,    reserved_meta_blocks    )
                __field(        int,    allocated_meta_blocks   )
+               __field(        int,    quota_claim             )
        ),
 
        TP_fast_assign(
@@ -1053,19 +1057,24 @@ TRACE_EVENT(ext4_da_update_reserve_space,
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->used_blocks = used_blocks;
-               __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-               __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
-               __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
+               __entry->reserved_data_blocks =
+                               EXT4_I(inode)->i_reserved_data_blocks;
+               __entry->reserved_meta_blocks =
+                               EXT4_I(inode)->i_reserved_meta_blocks;
+               __entry->allocated_meta_blocks =
+                               EXT4_I(inode)->i_allocated_meta_blocks;
+               __entry->quota_claim = quota_claim;
        ),
 
        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
                  "reserved_data_blocks %d reserved_meta_blocks %d "
-                 "allocated_meta_blocks %d",
+                 "allocated_meta_blocks %d quota_claim %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
-                 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
+                 __entry->reserved_meta_blocks, __entry->allocated_meta_blocks,
+                 __entry->quota_claim)
 );
 
 TRACE_EVENT(ext4_da_reserve_space,
@@ -1386,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
        TP_ARGS(inode)
 );
 
+/* 'ux' is the uninitialized extent. */
+TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
+       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+                struct ext4_extent *ux),
+
+       TP_ARGS(inode, map, ux),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    m_lblk  )
+               __field(        unsigned,       m_len   )
+               __field(        ext4_lblk_t,    u_lblk  )
+               __field(        unsigned,       u_len   )
+               __field(        ext4_fsblk_t,   u_pblk  )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->m_lblk         = map->m_lblk;
+               __entry->m_len          = map->m_len;
+               __entry->u_lblk         = le32_to_cpu(ux->ee_block);
+               __entry->u_len          = ext4_ext_get_actual_len(ux);
+               __entry->u_pblk         = ext4_ext_pblock(ux);
+       ),
+
+       TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
+                 "u_pblk %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 __entry->m_lblk, __entry->m_len,
+                 __entry->u_lblk, __entry->u_len, __entry->u_pblk)
+);
+
+/*
+ * 'ux' is the uninitialized extent.
+ * 'ix' is the initialized extent to which blocks are transferred.
+ */
+TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
+       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+                struct ext4_extent *ux, struct ext4_extent *ix),
+
+       TP_ARGS(inode, map, ux, ix),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    m_lblk  )
+               __field(        unsigned,       m_len   )
+               __field(        ext4_lblk_t,    u_lblk  )
+               __field(        unsigned,       u_len   )
+               __field(        ext4_fsblk_t,   u_pblk  )
+               __field(        ext4_lblk_t,    i_lblk  )
+               __field(        unsigned,       i_len   )
+               __field(        ext4_fsblk_t,   i_pblk  )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->m_lblk         = map->m_lblk;
+               __entry->m_len          = map->m_len;
+               __entry->u_lblk         = le32_to_cpu(ux->ee_block);
+               __entry->u_len          = ext4_ext_get_actual_len(ux);
+               __entry->u_pblk         = ext4_ext_pblock(ux);
+               __entry->i_lblk         = le32_to_cpu(ix->ee_block);
+               __entry->i_len          = ext4_ext_get_actual_len(ix);
+               __entry->i_pblk         = ext4_ext_pblock(ix);
+       ),
+
+       TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
+                 "u_lblk %u u_len %u u_pblk %llu "
+                 "i_lblk %u i_len %u i_pblk %llu ",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 __entry->m_lblk, __entry->m_len,
+                 __entry->u_lblk, __entry->u_len, __entry->u_pblk,
+                 __entry->i_lblk, __entry->i_len, __entry->i_pblk)
+);
+
 DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned int len, unsigned int flags),
@@ -1589,6 +1679,382 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free,
        TP_ARGS(sb, group, start, len)
 );
 
+TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
+       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+                unsigned int allocated, ext4_fsblk_t newblock),
+
+       TP_ARGS(inode, map, allocated, newblock),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino             )
+               __field(        dev_t,          dev             )
+               __field(        ext4_lblk_t,    lblk            )
+               __field(        ext4_fsblk_t,   pblk            )
+               __field(        unsigned int,   len             )
+               __field(        int,            flags           )
+               __field(        unsigned int,   allocated       )
+               __field(        ext4_fsblk_t,   newblk          )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->lblk           = map->m_lblk;
+               __entry->pblk           = map->m_pblk;
+               __entry->len            = map->m_len;
+               __entry->flags          = map->m_flags;
+               __entry->allocated      = allocated;
+               __entry->newblk         = newblock;
+       ),
+
+       TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d "
+                 "allocated %d newblock %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
+                 __entry->len, __entry->flags,
+                 (unsigned int) __entry->allocated,
+                 (unsigned long long) __entry->newblk)
+);
+
+TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
+       TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret),
+
+       TP_ARGS(sb, map, ret),
+
+       TP_STRUCT__entry(
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    lblk    )
+               __field(        ext4_fsblk_t,   pblk    )
+               __field(        unsigned int,   len     )
+               __field(        unsigned int,   flags   )
+               __field(        int,            ret     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = sb->s_dev;
+               __entry->lblk   = map->m_lblk;
+               __entry->pblk   = map->m_pblk;
+               __entry->len    = map->m_len;
+               __entry->flags  = map->m_flags;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %u ret %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->lblk, (unsigned long long) __entry->pblk,
+                 __entry->len, __entry->flags, __entry->ret)
+);
+
+TRACE_EVENT(ext4_ext_put_in_cache,
+       TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len,
+                ext4_fsblk_t start),
+
+       TP_ARGS(inode, lblk, len, start),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    lblk    )
+               __field(        unsigned int,   len     )
+               __field(        ext4_fsblk_t,   start   )
+       ),
+
+       TP_fast_assign(
+               __entry->ino    = inode->i_ino;
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->lblk   = lblk;
+               __entry->len    = len;
+               __entry->start  = start;
+       ),
+
+       TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->lblk,
+                 __entry->len,
+                 (unsigned long long) __entry->start)
+);
+
+TRACE_EVENT(ext4_ext_in_cache,
+       TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret),
+
+       TP_ARGS(inode, lblk, ret),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    lblk    )
+               __field(        int,            ret     )
+       ),
+
+       TP_fast_assign(
+               __entry->ino    = inode->i_ino;
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->lblk   = lblk;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("dev %d,%d ino %lu lblk %u ret %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->lblk,
+                 __entry->ret)
+
+);
+
+TRACE_EVENT(ext4_find_delalloc_range,
+       TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to,
+               int reverse, int found, ext4_lblk_t found_blk),
+
+       TP_ARGS(inode, from, to, reverse, found, found_blk),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino             )
+               __field(        dev_t,          dev             )
+               __field(        ext4_lblk_t,    from            )
+               __field(        ext4_lblk_t,    to              )
+               __field(        int,            reverse         )
+               __field(        int,            found           )
+               __field(        ext4_lblk_t,    found_blk       )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->from           = from;
+               __entry->to             = to;
+               __entry->reverse        = reverse;
+               __entry->found          = found;
+               __entry->found_blk      = found_blk;
+       ),
+
+       TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d "
+                 "(blk = %u)",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->from, (unsigned) __entry->to,
+                 __entry->reverse, __entry->found,
+                 (unsigned) __entry->found_blk)
+);
+
+TRACE_EVENT(ext4_get_reserved_cluster_alloc,
+       TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len),
+
+       TP_ARGS(inode, lblk, len),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    lblk    )
+               __field(        unsigned int,   len     )
+       ),
+
+       TP_fast_assign(
+               __entry->ino    = inode->i_ino;
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->lblk   = lblk;
+               __entry->len    = len;
+       ),
+
+       TP_printk("dev %d,%d ino %lu lblk %u len %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->lblk,
+                 __entry->len)
+);
+
+TRACE_EVENT(ext4_ext_show_extent,
+       TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
+                unsigned short len),
+
+       TP_ARGS(inode, lblk, pblk, len),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    lblk    )
+               __field(        ext4_fsblk_t,   pblk    )
+               __field(        unsigned short, len     )
+       ),
+
+       TP_fast_assign(
+               __entry->ino    = inode->i_ino;
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->lblk   = lblk;
+               __entry->pblk   = pblk;
+               __entry->len    = len;
+       ),
+
+       TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->lblk,
+                 (unsigned long long) __entry->pblk,
+                 (unsigned short) __entry->len)
+);
+
+TRACE_EVENT(ext4_remove_blocks,
+           TP_PROTO(struct inode *inode, struct ext4_extent *ex,
+               ext4_lblk_t from, ext4_fsblk_t to,
+               ext4_fsblk_t partial_cluster),
+
+       TP_ARGS(inode, ex, from, to, partial_cluster),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    ee_lblk )
+               __field(        ext4_fsblk_t,   ee_pblk )
+               __field(        unsigned short, ee_len  )
+               __field(        ext4_lblk_t,    from    )
+               __field(        ext4_lblk_t,    to      )
+               __field(        ext4_fsblk_t,   partial )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
+               __entry->ee_pblk        = ext4_ext_pblock(ex);
+               __entry->ee_len         = ext4_ext_get_actual_len(ex);
+               __entry->from           = from;
+               __entry->to             = to;
+               __entry->partial        = partial_cluster;
+       ),
+
+       TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u] "
+                 "from %u to %u partial_cluster %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->ee_lblk,
+                 (unsigned long long) __entry->ee_pblk,
+                 (unsigned short) __entry->ee_len,
+                 (unsigned) __entry->from,
+                 (unsigned) __entry->to,
+                 (unsigned) __entry->partial)
+);
+
+TRACE_EVENT(ext4_ext_rm_leaf,
+       TP_PROTO(struct inode *inode, ext4_lblk_t start,
+                struct ext4_extent *ex, ext4_fsblk_t partial_cluster),
+
+       TP_ARGS(inode, start, ex, partial_cluster),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    start   )
+               __field(        ext4_lblk_t,    ee_lblk )
+               __field(        ext4_fsblk_t,   ee_pblk )
+               __field(        short,          ee_len  )
+               __field(        ext4_fsblk_t,   partial )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->start          = start;
+               __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
+               __entry->ee_pblk        = ext4_ext_pblock(ex);
+               __entry->ee_len         = ext4_ext_get_actual_len(ex);
+               __entry->partial        = partial_cluster;
+       ),
+
+       TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u] "
+                 "partial_cluster %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->start,
+                 (unsigned) __entry->ee_lblk,
+                 (unsigned long long) __entry->ee_pblk,
+                 (unsigned short) __entry->ee_len,
+                 (unsigned) __entry->partial)
+);
+
+TRACE_EVENT(ext4_ext_rm_idx,
+       TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),
+
+       TP_ARGS(inode, pblk),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_fsblk_t,   pblk    )
+       ),
+
+       TP_fast_assign(
+               __entry->ino    = inode->i_ino;
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->pblk   = pblk;
+       ),
+
+       TP_printk("dev %d,%d ino %lu index_pblk %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned long long) __entry->pblk)
+);
+
+TRACE_EVENT(ext4_ext_remove_space,
+       TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth),
+
+       TP_ARGS(inode, start, depth),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino     )
+               __field(        dev_t,          dev     )
+               __field(        ext4_lblk_t,    start   )
+               __field(        int,            depth   )
+       ),
+
+       TP_fast_assign(
+               __entry->ino    = inode->i_ino;
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->start  = start;
+               __entry->depth  = depth;
+       ),
+
+       TP_printk("dev %d,%d ino %lu since %u depth %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->start,
+                 __entry->depth)
+);
+
+TRACE_EVENT(ext4_ext_remove_space_done,
+       TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth,
+               ext4_lblk_t partial, unsigned short eh_entries),
+
+       TP_ARGS(inode, start, depth, partial, eh_entries),
+
+       TP_STRUCT__entry(
+               __field(        ino_t,          ino             )
+               __field(        dev_t,          dev             )
+               __field(        ext4_lblk_t,    start           )
+               __field(        int,            depth           )
+               __field(        ext4_lblk_t,    partial         )
+               __field(        unsigned short, eh_entries      )
+       ),
+
+       TP_fast_assign(
+               __entry->ino            = inode->i_ino;
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->start          = start;
+               __entry->depth          = depth;
+               __entry->partial        = partial;
+               __entry->eh_entries     = eh_entries;
+       ),
+
+       TP_printk("dev %d,%d ino %lu since %u depth %d partial %u "
+                 "remaining_entries %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned) __entry->start,
+                 __entry->depth,
+                 (unsigned) __entry->partial,
+                 (unsigned short) __entry->eh_entries)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
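
Each TRACE_EVENT() added above expands into a trace_<event-name>() stub that the extent and mballoc code calls at the matching point, and the events can then be switched on at run time through the usual tracefs files under events/ext4/. A hedged sketch of a call site (illustrative only; the real callers live in fs/ext4/extents.c):

        /* Illustrative only: fire the new index-removal tracepoint. */
        static void example_trace_rm_idx(struct inode *inode, ext4_fsblk_t leaf)
        {
                trace_ext4_ext_rm_idx(inode, leaf);
        }
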