Merge branch 'next' into upstream-merge
author    Theodore Ts'o <tytso@mit.edu>    Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
committer Theodore Ts'o <tytso@mit.edu>    Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
Conflicts:
fs/ext4/inode.c
fs/ext4/mballoc.c
include/trace/events/ext4.h

15 files changed:
fs/ext4/extents.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
include/linux/blkdev.h
include/linux/fs.h
include/linux/writeback.h
include/trace/events/ext4.h

diff --combined fs/ext4/extents.c
  #include "ext4_jbd2.h"
  #include "ext4_extents.h"
  
- /*
-  * ext_pblock:
-  * combine low and high parts of physical block number into ext4_fsblk_t
-  */
- ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
- {
-       ext4_fsblk_t block;
-       block = le32_to_cpu(ex->ee_start_lo);
-       block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-       return block;
- }
- /*
-  * idx_pblock:
-  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
-  */
- ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
- {
-       ext4_fsblk_t block;
-       block = le32_to_cpu(ix->ei_leaf_lo);
-       block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-       return block;
- }
- /*
-  * ext4_ext_store_pblock:
-  * stores a large physical block number into an extent struct,
-  * breaking it into parts
-  */
- void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
- {
-       ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-       ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
- }
- /*
-  * ext4_idx_store_pblock:
-  * stores a large physical block number into an index struct,
-  * breaking it into parts
-  */
- static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
- {
-       ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-       ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
- }
  static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
@@@ -169,7 -120,8 +120,8 @@@ static ext4_fsblk_t ext4_ext_find_goal(
                /* try to predict block placement */
                ex = path[depth].p_ext;
                if (ex)
-                       return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+                       return (ext4_ext_pblock(ex) +
+                               (block - le32_to_cpu(ex->ee_block)));
  
                /* it looks like index is empty;
                 * try to find starting block from index itself */
@@@ -354,7 -306,7 +306,7 @@@ ext4_ext_max_entries(struct inode *inod
  
  static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
  {
-       ext4_fsblk_t block = ext_pblock(ext);
+       ext4_fsblk_t block = ext4_ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
  
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
  static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
  {
-       ext4_fsblk_t block = idx_pblock(ext_idx);
+       ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
  
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
  }
@@@ -463,13 -415,13 +415,13 @@@ static void ext4_ext_show_path(struct i
        for (k = 0; k <= l; k++, path++) {
                if (path->p_idx) {
                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-                           idx_pblock(path->p_idx));
+                           ext4_idx_pblock(path->p_idx));
                } else if (path->p_ext) {
                        ext_debug("  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
                                  ext4_ext_is_uninitialized(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
-                                 ext_pblock(path->p_ext));
+                                 ext4_ext_pblock(path->p_ext));
                } else
                        ext_debug("  []");
        }
@@@ -494,7 -446,7 +446,7 @@@ static void ext4_ext_show_leaf(struct i
        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_uninitialized(ex),
-                         ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                         ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug("\n");
  }
@@@ -545,7 -497,7 +497,7 @@@ ext4_ext_binsearch_idx(struct inode *in
  
        path->p_idx = l - 1;
        ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-                 idx_pblock(path->p_idx));
+                 ext4_idx_pblock(path->p_idx));
  
  #ifdef CHECK_BINSEARCH
        {
@@@ -614,7 -566,7 +566,7 @@@ ext4_ext_binsearch(struct inode *inode
        path->p_ext = l - 1;
        ext_debug("  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
-                       ext_pblock(path->p_ext),
+                       ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_uninitialized(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));
  
@@@ -682,7 -634,7 +634,7 @@@ ext4_ext_find_extent(struct inode *inod
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
  
                ext4_ext_binsearch_idx(inode, path + ppos, block);
-               path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+               path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
  
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
-               path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+               path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
  
        ext4_ext_show_path(inode, path);
  
@@@ -739,9 -691,9 +691,9 @@@ err
   * insert new index [@logical;@ptr] into the block at @curp;
   * check where to insert: before @curp or after @curp
   */
- int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-                               struct ext4_ext_path *curp,
-                               int logical, ext4_fsblk_t ptr)
+ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+                                struct ext4_ext_path *curp,
+                                int logical, ext4_fsblk_t ptr)
  {
        struct ext4_extent_idx *ix;
        int len, err;
@@@ -917,7 -869,7 +869,7 @@@ static int ext4_ext_split(handle_t *han
                        EXT_MAX_EXTENT(path[depth].p_hdr)) {
                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(path[depth].p_ext->ee_block),
-                               ext_pblock(path[depth].p_ext),
+                               ext4_ext_pblock(path[depth].p_ext),
                                ext4_ext_is_uninitialized(path[depth].p_ext),
                                ext4_ext_get_actual_len(path[depth].p_ext),
                                newblock);
                while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
                        ext_debug("%d: move %d:%llu in new index %llu\n", i,
                                        le32_to_cpu(path[i].p_idx->ei_block),
-                                       idx_pblock(path[i].p_idx),
+                                       ext4_idx_pblock(path[i].p_idx),
                                        newblock);
                        /*memmove(++fidx, path[i].p_idx++,
                                        sizeof(struct ext4_extent_idx));
@@@ -1146,7 -1098,7 +1098,7 @@@ static int ext4_ext_grow_indepth(handle
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-                 idx_pblock(EXT_FIRST_INDEX(neh)));
+                 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
  
        neh->eh_depth = cpu_to_le16(path->p_depth + 1);
        err = ext4_ext_dirty(handle, inode, curp);
@@@ -1232,9 -1184,9 +1184,9 @@@ out
   * returns 0 at @phys
   * return value contains 0 (success) or error code
   */
- int
- ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
-                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
+ static int ext4_ext_search_left(struct inode *inode,
+                               struct ext4_ext_path *path,
+                               ext4_lblk_t *logical, ext4_fsblk_t *phys)
  {
        struct ext4_extent_idx *ix;
        struct ext4_extent *ex;
        }
  
        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-       *phys = ext_pblock(ex) + ee_len - 1;
+       *phys = ext4_ext_pblock(ex) + ee_len - 1;
        return 0;
  }
  
   * returns 0 at @phys
   * return value contains 0 (success) or error code
   */
- int
- ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
-                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
+ static int ext4_ext_search_right(struct inode *inode,
+                                struct ext4_ext_path *path,
+                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
  {
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *eh;
                        }
                }
                *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext_pblock(ex);
+               *phys = ext4_ext_pblock(ex);
                return 0;
        }
  
                /* next allocated block in this leaf */
                ex++;
                *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext_pblock(ex);
+               *phys = ext4_ext_pblock(ex);
                return 0;
        }
  
@@@ -1376,7 -1328,7 +1328,7 @@@ got_index
         * follow it and find the closest allocated
         * block to the right */
        ix++;
-       block = idx_pblock(ix);
+       block = ext4_idx_pblock(ix);
        while (++depth < path->p_depth) {
                bh = sb_bread(inode->i_sb, block);
                if (bh == NULL)
                        return -EIO;
                }
                ix = EXT_FIRST_INDEX(eh);
-               block = idx_pblock(ix);
+               block = ext4_idx_pblock(ix);
                put_bh(bh);
        }
  
        }
        ex = EXT_FIRST_EXTENT(eh);
        *logical = le32_to_cpu(ex->ee_block);
-       *phys = ext_pblock(ex);
+       *phys = ext4_ext_pblock(ex);
        put_bh(bh);
        return 0;
  }
@@@ -1573,7 -1525,7 +1525,7 @@@ ext4_can_extents_be_merged(struct inod
                return 0;
  #endif
  
-       if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+       if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
                return 1;
        return 0;
  }
   * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
   * 1 if they got merged.
   */
- int ext4_ext_try_to_merge(struct inode *inode,
-                         struct ext4_ext_path *path,
-                         struct ext4_extent *ex)
+ static int ext4_ext_try_to_merge(struct inode *inode,
+                                struct ext4_ext_path *path,
+                                struct ext4_extent *ex)
  {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
   * such that there will be no overlap, and then returns 1.
   * If there is no overlap found, it returns 0.
   */
- unsigned int ext4_ext_check_overlap(struct inode *inode,
-                                   struct ext4_extent *newext,
-                                   struct ext4_ext_path *path)
+ static unsigned int ext4_ext_check_overlap(struct inode *inode,
+                                          struct ext4_extent *newext,
+                                          struct ext4_ext_path *path)
  {
        ext4_lblk_t b1, b2;
        unsigned int depth, len1;
@@@ -1706,11 -1658,12 +1658,12 @@@ int ext4_ext_insert_extent(handle_t *ha
        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-                               ext4_ext_is_uninitialized(newext),
-                               ext4_ext_get_actual_len(newext),
-                               le32_to_cpu(ex->ee_block),
-                               ext4_ext_is_uninitialized(ex),
-                               ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                         ext4_ext_is_uninitialized(newext),
+                         ext4_ext_get_actual_len(newext),
+                         le32_to_cpu(ex->ee_block),
+                         ext4_ext_is_uninitialized(ex),
+                         ext4_ext_get_actual_len(ex),
+                         ext4_ext_pblock(ex));
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        return err;
@@@ -1780,7 -1733,7 +1733,7 @@@ has_space
                /* there is no extent in this leaf, create first one */
                ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
-                               ext_pblock(newext),
+                               ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext));
                path[depth].p_ext = EXT_FIRST_EXTENT(eh);
                        ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
                                        "move %d from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
-                                       ext_pblock(newext),
+                                       ext4_ext_pblock(newext),
                                        ext4_ext_is_uninitialized(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex, len, nearex + 1, nearex + 2);
                ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
                                "move %d from 0x%p to 0x%p\n",
                                le32_to_cpu(newext->ee_block),
-                               ext_pblock(newext),
+                               ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext),
                                nearex, len, nearex + 1, nearex + 2);
        le16_add_cpu(&eh->eh_entries, 1);
        nearex = path[depth].p_ext;
        nearex->ee_block = newext->ee_block;
-       ext4_ext_store_pblock(nearex, ext_pblock(newext));
+       ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;
  
  merge:
@@@ -1845,9 -1798,9 +1798,9 @@@ cleanup
        return err;
  }
  
- int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-                       ext4_lblk_t num, ext_prepare_callback func,
-                       void *cbdata)
+ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+                              ext4_lblk_t num, ext_prepare_callback func,
+                              void *cbdata)
  {
        struct ext4_ext_path *path = NULL;
        struct ext4_ext_cache cbex;
                } else {
                        cbex.ec_block = le32_to_cpu(ex->ee_block);
                        cbex.ec_len = ext4_ext_get_actual_len(ex);
-                       cbex.ec_start = ext_pblock(ex);
+                       cbex.ec_start = ext4_ext_pblock(ex);
                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
  
@@@ -2073,7 -2026,7 +2026,7 @@@ static int ext4_ext_rm_idx(handle_t *ha
  
        /* free index block */
        path--;
-       leaf = idx_pblock(path->p_idx);
+       leaf = ext4_idx_pblock(path->p_idx);
        if (unlikely(path->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                return -EIO;
@@@ -2181,7 -2134,7 +2134,7 @@@ static int ext4_remove_blocks(handle_t 
                ext4_fsblk_t start;
  
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
-               start = ext_pblock(ex) + ee_len - num;
+               start = ext4_ext_pblock(ex) + ee_len - num;
                ext_debug("free last %u blocks starting %llu\n", num, start);
                ext4_free_blocks(handle, inode, 0, start, num, flags);
        } else if (from == le32_to_cpu(ex->ee_block)
@@@ -2310,7 -2263,7 +2263,7 @@@ ext4_ext_rm_leaf(handle_t *handle, stru
                        goto out;
  
                ext_debug("new extent: %u:%u:%llu\n", block, num,
-                               ext_pblock(ex));
+                               ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
                ex_ee_len = ext4_ext_get_actual_len(ex);
@@@ -2421,9 -2374,9 +2374,9 @@@ again
                        struct buffer_head *bh;
                        /* go to the next level */
                        ext_debug("move to level %d (block %llu)\n",
-                                 i + 1, idx_pblock(path[i].p_idx));
+                                 i + 1, ext4_idx_pblock(path[i].p_idx));
                        memset(path + i + 1, 0, sizeof(*path));
-                       bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+                       bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
                        if (!bh) {
                                /* should we reset i_size? */
                                err = -EIO;
@@@ -2535,77 -2488,22 +2488,21 @@@ void ext4_ext_release(struct super_bloc
  #endif
  }
  
- static void bi_complete(struct bio *bio, int error)
- {
-       complete((struct completion *)bio->bi_private);
- }
  /* FIXME!! we need to try to merge to left or right after zero-out  */
  static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  {
+       ext4_fsblk_t ee_pblock;
+       unsigned int ee_len;
        int ret;
-       struct bio *bio;
-       int blkbits, blocksize;
-       sector_t ee_pblock;
-       struct completion event;
-       unsigned int ee_len, len, done, offset;
  
-       blkbits   = inode->i_blkbits;
-       blocksize = inode->i_sb->s_blocksize;
        ee_len    = ext4_ext_get_actual_len(ex);
-       ee_pblock = ext_pblock(ex);
-       /* convert ee_pblock to 512 byte sectors */
-       ee_pblock = ee_pblock << (blkbits - 9);
-       while (ee_len > 0) {
-               if (ee_len > BIO_MAX_PAGES)
-                       len = BIO_MAX_PAGES;
-               else
-                       len = ee_len;
-               bio = bio_alloc(GFP_NOIO, len);
-               if (!bio)
-                       return -ENOMEM;
+       ee_pblock = ext4_ext_pblock(ex);
  
-               bio->bi_sector = ee_pblock;
-               bio->bi_bdev   = inode->i_sb->s_bdev;
 -      ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len,
 -                             GFP_NOFS, BLKDEV_IFL_WAIT);
++      ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
+       if (ret > 0)
+               ret = 0;
  
-               done = 0;
-               offset = 0;
-               while (done < len) {
-                       ret = bio_add_page(bio, ZERO_PAGE(0),
-                                                       blocksize, offset);
-                       if (ret != blocksize) {
-                               /*
-                                * We can't add any more pages because of
-                                * hardware limitations.  Start a new bio.
-                                */
-                               break;
-                       }
-                       done++;
-                       offset += blocksize;
-                       if (offset >= PAGE_CACHE_SIZE)
-                               offset = 0;
-               }
-               init_completion(&event);
-               bio->bi_private = &event;
-               bio->bi_end_io = bi_complete;
-               submit_bio(WRITE, bio);
-               wait_for_completion(&event);
-               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-                       bio_put(bio);
-                       return -EIO;
-               }
-               bio_put(bio);
-               ee_len    -= done;
-               ee_pblock += done  << (blkbits - 9);
-       }
-       return 0;
+       return ret;
  }
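
The rewritten ext4_ext_zeroout() above hands the work to
sb_issue_zeroout(), which performs the filesystem-block to
512-byte-sector conversion that the deleted bio loop did by hand as
"ee_pblock << (blkbits - 9)". A small model of that conversion, with an
invented block size and block number:

    /* blkbits = 12 means 4 KiB blocks, so 2^(12-9) = 8 sectors each. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int blkbits = 12;   /* from inode->i_blkbits in ext4 */
            uint64_t ee_pblock = 1000;   /* extent start, in fs blocks */
            uint64_t sector = ee_pblock << (blkbits - 9);

            /* block 1000 starts at sector 8000 */
            printf("block %llu -> sector %llu\n",
                   (unsigned long long)ee_pblock,
                   (unsigned long long)sector);
            return 0;
    }
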
  
  #define EXT4_EXT_ZERO_LEN 7
@@@ -2651,12 -2549,12 +2548,12 @@@ static int ext4_ext_convert_to_initiali
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext_pblock(ex);
+       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
  
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
  
        /*
         * It is safe to convert extent to initialized via explicit
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zeroed the full extent */
                return allocated;
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = cpu_to_le16(ee_len - allocated);
                        ext4_ext_mark_uninitialized(ex);
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
  
                        ex3 = &newex;
                                        goto fix_extent_len;
                                ex->ee_block = orig_ex.ee_block;
                                ex->ee_len   = orig_ex.ee_len;
-                               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                               ext4_ext_store_pblock(ex,
+                                       ext4_ext_pblock(&orig_ex));
                                ext4_ext_dirty(handle, inode, path + depth);
                                /* blocks available from map->m_lblk */
                                return allocated;
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zero out the first half */
                        /* blocks available from map->m_lblk */
@@@ -2902,7 -2801,7 +2800,7 @@@ insert
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@@ -2915,7 -2814,7 +2813,7 @@@ out
  fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@@ -2973,12 -2872,12 +2871,12 @@@ static int ext4_split_unwritten_extents
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext_pblock(ex);
+       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
  
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
  
        /*
         * It is safe to convert extent to initialized via explicit
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@@ -3099,7 -2998,7 +2997,7 @@@ insert
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@@ -3112,7 -3011,7 +3010,7 @@@ out
  fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@@ -3180,6 -3079,57 +3078,57 @@@ static void unmap_underlying_metadata_b
                  unmap_underlying_metadata(bdev, block + i);
  }
  
+ /*
+  * Handle EOFBLOCKS_FL flag, clearing it if necessary
+  */
+ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+                             struct ext4_map_blocks *map,
+                             struct ext4_ext_path *path,
+                             unsigned int len)
+ {
+       int i, depth;
+       struct ext4_extent_header *eh;
+       struct ext4_extent *ex, *last_ex;
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+               return 0;
+       depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
+       ex = path[depth].p_ext;
+       if (unlikely(!eh->eh_entries)) {
+               EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+                                "EOFBLOCKS_FL set");
+               return -EIO;
+       }
+       last_ex = EXT_LAST_EXTENT(eh);
+       /*
+        * We should clear the EOFBLOCKS_FL flag if we are writing the
+        * last block in the last extent in the file.  We test this by
+        * first checking to see if the caller to
+        * ext4_ext_map_blocks() was interested in the last block (or
+        * a block beyond the last block) in the current extent.  If
+        * this turns out to be false, we can bail out from this
+        * function immediately.
+        */
+       if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+           ext4_ext_get_actual_len(last_ex))
+               return 0;
+       /*
+        * If the caller does appear to be planning to write at or
+        * beyond the end of the current extent, we then test to see
+        * if the current extent is the last extent in the file, by
+        * checking to make sure it was reached via the rightmost node
+        * at each level of the tree.
+        */
+       for (i = depth-1; i >= 0; i--)
+               if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+                       return 0;
+       ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       return ext4_mark_inode_dirty(handle, inode);
+ }
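
The early-return test in check_eofblocks_fl() is easiest to see with
concrete numbers. A standalone model with invented extent coordinates:

    /* The last extent covers logical blocks [100, 108) (ee_block = 100,
     * actual length 8); its last block is 107. A map request reaches
     * the last block only if m_lblk + len goes to 107 or beyond. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int last_ee_block = 100, last_ee_len = 8;
            unsigned int m_lblk = 104, len = 4;  /* covers blocks 104..107 */

            if (m_lblk + len < last_ee_block + last_ee_len)
                    printf("bail out: request ends before the last block\n");
            else
                    printf("walk the rightmost path; maybe clear the flag\n");
            return 0;
    }
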
  static int
  ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
                 * completed
                 */
                if (io)
-                       io->flag = EXT4_IO_UNWRITTEN;
+                       io->flag = EXT4_IO_END_UNWRITTEN;
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
        if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
                ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                        path);
-               if (ret >= 0)
+               if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
+                       err = check_eofblocks_fl(handle, inode, map, path,
+                                                map->m_len);
+               } else
+                       err = ret;
                goto out2;
        }
        /* buffered IO case */
  
        /* buffered write, writepage time, convert*/
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-       if (ret >= 0)
+       if (ret >= 0) {
                ext4_update_inode_fsync_trans(handle, inode, 1);
+               err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+               if (err < 0)
+                       goto out2;
+       }
  out:
        if (ret <= 0) {
                err = ret;
@@@ -3292,6 -3251,7 +3250,7 @@@ out2
        }
        return err ? err : allocated;
  }
  /*
   * Block allocation/map/preallocation routine for extents based files
   *
@@@ -3315,9 -3275,9 +3274,9 @@@ int ext4_ext_map_blocks(handle_t *handl
  {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
-       struct ext4_extent newex, *ex, *last_ex;
+       struct ext4_extent newex, *ex;
        ext4_fsblk_t newblock;
-       int i, err = 0, depth, ret, cache_type;
+       int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
                        /* block is already allocated */
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
-                                  + ext_pblock(&newex);
+                                  + ext4_ext_pblock(&newex);
                        /* number of remaining blocks in the extent */
                        allocated = ext4_ext_get_actual_len(&newex) -
                                (map->m_lblk - le32_to_cpu(newex.ee_block));
        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-               ext4_fsblk_t ee_start = ext_pblock(ex);
+               ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
  
                /*
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                        if (io)
-                               io->flag = EXT4_IO_UNWRITTEN;
+                               io->flag = EXT4_IO_END_UNWRITTEN;
                        else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
  
-       if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-               if (unlikely(!eh->eh_entries)) {
-                       EXT4_ERROR_INODE(inode,
-                                        "eh->eh_entries == 0 and "
-                                        "EOFBLOCKS_FL set");
-                       err = -EIO;
-                       goto out2;
-               }
-               last_ex = EXT_LAST_EXTENT(eh);
-               /*
-                * If the current leaf block was reached by looking at
-                * the last index block all the way down the tree, and
-                * we are extending the inode beyond the last extent
-                * in the current leaf block, then clear the
-                * EOFBLOCKS_FL flag.
-                */
-               for (i = depth-1; i >= 0; i--) {
-                       if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-                               break;
-               }
-               if ((i < 0) &&
-                   (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-                    ext4_ext_get_actual_len(last_ex)))
-                       ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-       }
+       err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+       if (err)
+               goto out2;
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err) {
                /* free data blocks we just allocated */
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-               ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+               ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
                                 ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
  
        /* previous routine could use block we allocated */
-       newblock = ext_pblock(&newex);
+       newblock = ext4_ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
        if (allocated > map->m_len)
                allocated = map->m_len;
@@@ -3729,7 -3668,7 +3667,7 @@@ retry
                        printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%u", __func__,
-                                   inode->i_ino, block, max_blocks);
+                                   inode->i_ino, map.m_lblk, max_blocks);
  #endif
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
diff --combined fs/ext4/fsync.c
  
  #include <trace/events/ext4.h>
  
+ static void dump_completed_IO(struct inode * inode)
+ {
+ #ifdef        EXT4_DEBUG
+       struct list_head *cur, *before, *after;
+       ext4_io_end_t *io, *io0, *io1;
+       unsigned long flags;
+       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+               return;
+       }
+       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+               cur = &io->list;
+               before = cur->prev;
+               io0 = container_of(before, ext4_io_end_t, list);
+               after = cur->next;
+               io1 = container_of(after, ext4_io_end_t, list);
+               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                           io, inode->i_ino, io0, io1);
+       }
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+ #endif
+ }
+ /*
+  * This function is called from ext4_sync_file().
+  *
+  * When IO is completed, the work to convert unwritten extents to
+  * written is queued on a workqueue but may not get scheduled
+  * immediately. When fsync is called, we need to ensure the
+  * conversion is complete before fsync returns.
+  * The inode keeps track of a list of pending/completed IO that
+  * might need the conversion. This function walks through the list
+  * and converts the related unwritten extents to written for each
+  * completed IO.
+  * The function returns 0 on success, or a negative error code.
+  */
+ static int flush_completed_IO(struct inode *inode)
+ {
+       ext4_io_end_t *io;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long flags;
+       int ret = 0;
+       int ret2 = 0;
+       if (list_empty(&ei->i_completed_io_list))
+               return ret;
+       dump_completed_IO(inode);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&ei->i_completed_io_list)){
+               io = list_entry(ei->i_completed_io_list.next,
+                               ext4_io_end_t, list);
+               /*
+                * Call ext4_end_io_nolock() to convert the completed
+                * IO to written.
+                *
+                * When ext4_sync_file() is called, the workqueue may
+                * already be about to flush the work corresponding to
+                * this io structure, and that work would break if the
+                * io structure it references had been freed.
+                *
+                * Thus we need to keep the io structure valid here
+                * after the conversion has finished. The io structure
+                * carries a flag to avoid converting the extents twice,
+                * once from fsync and once from the background
+                * workqueue.
+                */
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+               ret = ext4_end_io_nolock(io);
+               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+               if (ret < 0)
+                       ret2 = ret;
+               else
+                       list_del_init(&io->list);
+       }
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+       return (ret2 < 0) ? ret2 : 0;
+ }
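
flush_completed_IO() follows a classic shape: hold the spinlock only to
pick an entry, drop it around the call that may block, and retake it
before touching the list again. A userspace sketch of the same shape,
simplified to a single consumer and a bare singly linked list, with all
names invented:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct io_end {
            struct io_end *next;
            int id;
    };

    static struct io_end *completed_list;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Stands in for ext4_end_io_nolock(): may block, lock not held. */
    static int process_io(struct io_end *io)
    {
            printf("converting io %d\n", io->id);
            return 0;
    }

    static int flush_completed(void)
    {
            int err = 0;

            pthread_mutex_lock(&list_lock);
            while (completed_list) {
                    struct io_end *io = completed_list;
                    int ret;

                    /* drop the lock around the blocking call */
                    pthread_mutex_unlock(&list_lock);
                    ret = process_io(io);
                    pthread_mutex_lock(&list_lock);
                    if (ret < 0) {
                            err = ret;            /* keep the entry */
                            break;
                    }
                    completed_list = io->next;    /* unlink on success */
                    free(io);
            }
            pthread_mutex_unlock(&list_lock);
            return err;
    }

    int main(void)
    {
            for (int i = 2; i >= 1; i--) {
                    struct io_end *io = malloc(sizeof(*io));

                    io->id = i;
                    io->next = completed_list;
                    completed_list = io;
            }
            return flush_completed();
    }
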
  /*
   * If we're not journaling and this is a just-created file, we have to
   * sync our parent directory (if it was freshly created) since
@@@ -128,9 -211,10 +211,9 @@@ int ext4_sync_file(struct file *file, i
                    (journal->j_fs_dev != journal->j_dev) &&
                    (journal->j_flags & JBD2_BARRIER))
                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
 -                                      NULL, BLKDEV_IFL_WAIT);
 +                                      NULL);
                ret = jbd2_log_wait_commit(journal, commit_tid);
        } else if (journal->j_flags & JBD2_BARRIER)
 -              blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
 -                      BLKDEV_IFL_WAIT);
 +              blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
        return ret;
  }
diff --combined fs/ext4/ialloc.c
@@@ -50,7 -50,7 +50,7 @@@
   * need to use it within a single byte (to ensure we get endianness right).
   * We can use memset for the rest of the bitmap as there are no other users.
   */
- void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
  {
        int i;
  
  }
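
The comment above explains the job of ext4_mark_bitmap_end(): the bits
between the last real inode and the end of the bitmap block must read
as allocated so they are never handed out. A generic sketch of that
effect (not the kernel implementation, which per the comment uses
ext4_set_bit within the partial byte and memset for the rest):

    #include <stdio.h>
    #include <string.h>

    static void mark_bitmap_end(int start_bit, int end_bit,
                                unsigned char *bitmap)
    {
            int i;

            for (i = start_bit; i < end_bit; i++)
                    bitmap[i >> 3] |= 1 << (i & 7);  /* little-endian order */
    }

    int main(void)
    {
            unsigned char bitmap[16];            /* a 128-bit bitmap */

            memset(bitmap, 0, sizeof(bitmap));
            mark_bitmap_end(100, 128, bitmap);   /* e.g. 100 inodes per group */
            printf("last byte: %#x\n", bitmap[15]);  /* prints 0xff */
            return 0;
    }
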
  
  /* Initializes an uninitialized inode bitmap */
- unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
-                               ext4_group_t block_group,
-                               struct ext4_group_desc *gdp)
+ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+                                      struct buffer_head *bh,
+                                      ext4_group_t block_group,
+                                      struct ext4_group_desc *gdp)
  {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
  
@@@ -85,7 -86,7 +86,7 @@@
        }
  
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+       ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
  
        return EXT4_INODES_PER_GROUP(sb);
@@@ -107,6 -108,7 +108,7 @@@ ext4_read_inode_bitmap(struct super_blo
        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return NULL;
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                unlock_buffer(bh);
                return bh;
        }
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
                return bh;
        }
        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@@ -411,8 -415,8 +415,8 @@@ struct orlov_stats 
   * for a particular block group or flex_bg.  If flex_size is 1, then g
   * is a block group number; otherwise it is flex_bg number.
   */
- void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-                      int flex_size, struct orlov_stats *stats)
+ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+                           int flex_size, struct orlov_stats *stats)
  {
        struct ext4_group_desc *desc;
        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@@ -712,8 -716,17 +716,17 @@@ static int ext4_claim_inode(struct supe
  {
        int free = 0, retval = 0, count;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
  
+       /*
+        * We have to be sure that new inode allocation does not race with
+        * inode table initialization, because otherwise we may end up
+        * allocating and writing a new inode right before the
+        * sb_issue_zeroout takes place and have it overwritten with
+        * zeroes. So we take alloc_sem to prevent that.
+        */
+       down_read(&grp->alloc_sem);
        ext4_lock_group(sb, group);
        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
                ext4_unlock_group(sb, group);
+               up_read(&grp->alloc_sem);
                ext4_error(sb, "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
                           ino + group * EXT4_INODES_PER_GROUP(sb));
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
  err_ret:
        ext4_unlock_group(sb, group);
+       up_read(&grp->alloc_sem);
        return retval;
  }
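
The down_read() added above pairs with the down_write() taken in
ext4_init_inode_table() later in this patch: allocators hold alloc_sem
shared, the lazy-init thread holds it exclusive, so a fresh inode can
never be written into a table region that is about to be zeroed. A
userspace analogue using a pthread rwlock, names invented:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;

    static void claim_inode(void)           /* ~ ext4_claim_inode() */
    {
            pthread_rwlock_rdlock(&alloc_sem);   /* ~ down_read() */
            printf("allocating inode; zeroout excluded\n");
            pthread_rwlock_unlock(&alloc_sem);   /* ~ up_read() */
    }

    static void init_inode_table(void)      /* ~ ext4_init_inode_table() */
    {
            pthread_rwlock_wrlock(&alloc_sem);   /* ~ down_write() */
            printf("zeroing inode table; allocators excluded\n");
            pthread_rwlock_unlock(&alloc_sem);   /* ~ up_write() */
    }

    int main(void)
    {
            claim_inode();
            init_inode_table();
            return 0;
    }
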
  
@@@ -1205,3 -1220,110 +1220,109 @@@ unsigned long ext4_count_dirs(struct su
        }
        return count;
  }
+ /*
+  * Zeroes the not-yet-zeroed inode table - just writes zeroes through
+  * the whole inode table. Must be called without any spinlock held.
+  * The only place it is called from on an active filesystem is the
+  * ext4lazyinit thread, so we do not need any special locks; however,
+  * we have to prevent inode allocation from the current group, so we
+  * take alloc_sem to block ext4_claim_inode until we are finished.
+  */
+ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                                int barrier)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_desc *gdp = NULL;
+       struct buffer_head *group_desc_bh;
+       handle_t *handle;
+       ext4_fsblk_t blk;
+       int num, ret = 0, used_blks = 0;
 -      unsigned long flags = BLKDEV_IFL_WAIT;
+       /* This should not happen, but just to be sure check this */
+       if (sb->s_flags & MS_RDONLY) {
+               ret = 1;
+               goto out;
+       }
+       gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+       if (!gdp)
+               goto out;
+       /*
+        * We do not need to lock this, because we are the only one
+        * handling this flag.
+        */
+       if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+               goto out;
+       handle = ext4_journal_start_sb(sb, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+       down_write(&grp->alloc_sem);
+       /*
+        * If inode bitmap was already initialized there may be some
+        * used inodes so we need to skip blocks with used inodes in
+        * inode table.
+        */
+       if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+               used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                           ext4_itable_unused_count(sb, gdp)),
+                           sbi->s_inodes_per_block);
+       if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+               ext4_error(sb, "Something is wrong with group %u\n"
+                          "Used itable blocks: %d\n"
+                          "itable unused count: %u\n",
+                          group, used_blks,
+                          ext4_itable_unused_count(sb, gdp));
+               ret = 1;
+               goto out;
+       }
+       blk = ext4_inode_table(sb, gdp) + used_blks;
+       num = sbi->s_itb_per_group - used_blks;
+       BUFFER_TRACE(group_desc_bh, "get_write_access");
+       ret = ext4_journal_get_write_access(handle,
+                                           group_desc_bh);
+       if (ret)
+               goto err_out;
+       /*
+        * Skip zeroout if the inode table is full. But we set the ZEROED
+        * flag anyway, because obviously, when it is full it does not need
+        * further zeroing.
+        */
+       if (unlikely(num == 0))
+               goto skip_zeroout;
+       ext4_debug("going to zero out inode table in group %d\n",
+                  group);
 -      if (barrier)
 -              flags |= BLKDEV_IFL_BARRIER;
 -      ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags);
++      ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+       if (ret < 0)
+               goto err_out;
++      if (barrier)
++              blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+ skip_zeroout:
+       ext4_lock_group(sb, group);
+       gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+       ext4_unlock_group(sb, group);
+       BUFFER_TRACE(group_desc_bh,
+                    "call ext4_handle_dirty_metadata");
+       ret = ext4_handle_dirty_metadata(handle, NULL,
+                                        group_desc_bh);
+ err_out:
+       up_write(&grp->alloc_sem);
+       ext4_journal_stop(handle);
+ out:
+       return ret;
+ }
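
The used_blks computation above is the one subtle piece of arithmetic
in this function: it turns a count of possibly-used inodes into a count
of inode-table blocks the zeroout must skip. A worked example with
invented but plausible numbers:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned int inodes_per_group = 8192;
            unsigned int itable_unused = 8000;   /* from the group desc */
            unsigned int inodes_per_block = 16;  /* 256 B inodes, 4 KiB blocks */

            /* 192 possibly-used inodes span ceil(192/16) = 12 blocks */
            unsigned int used_blks =
                    DIV_ROUND_UP(inodes_per_group - itable_unused,
                                 inodes_per_block);
            printf("skip %u used itable blocks, zero the rest\n", used_blks);
            return 0;
    }
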
diff --combined fs/ext4/inode.c
@@@ -60,6 -60,12 +60,12 @@@ static inline int ext4_begin_ordered_tr
  }
  
  static void ext4_invalidatepage(struct page *page, unsigned long offset);
+ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create);
+ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+ static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
  
  /*
   * Test whether an inode is a fast symlink.
@@@ -755,6 -761,11 +761,11 @@@ static int ext4_alloc_branch(handle_t *
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+               if (unlikely(!bh)) {
+                       err = -EIO;
+                       goto failed;
+               }
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
@@@ -1207,8 -1218,10 +1218,10 @@@ static pgoff_t ext4_num_dirty_pages(str
                                break;
                        idx++;
                        num++;
-                       if (num >= max_pages)
+                       if (num >= max_pages) {
+                               done = 1;
                                break;
+                       }
                }
                pagevec_release(&pvec);
        }
@@@ -1538,10 -1551,10 +1551,10 @@@ static int do_journal_get_write_access(
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        /*
 -       * __block_prepare_write() could have dirtied some buffers. Clean
 +       * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
 -       * by __block_prepare_write() isn't a real problem here as we clear
 +       * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
@@@ -1995,16 -2008,23 +2008,23 @@@ static void ext4_da_page_release_reserv
   *
   * As pages are already locked by write_cache_pages(), we can't use it
   */
- static int mpage_da_submit_io(struct mpage_da_data *mpd)
+ static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                             struct ext4_map_blocks *map)
  {
-       long pages_skipped;
        struct pagevec pvec;
        unsigned long index, end;
        int ret = 0, err, nr_pages, i;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+       loff_t size = i_size_read(inode);
+       unsigned int len, block_start;
+       struct buffer_head *bh, *page_bufs = NULL;
+       int journal_data = ext4_should_journal_data(inode);
+       sector_t pblock = 0, cur_logical = 0;
+       struct ext4_io_submit io_submit;
  
        BUG_ON(mpd->next_page <= mpd->first_page);
+       memset(&io_submit, 0, sizeof(io_submit));
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
+                       int commit_write = 0, redirty_page = 0;
                        struct page *page = pvec.pages[i];
  
                        index = page->index;
                        if (index > end)
                                break;
+                       if (index == size >> PAGE_CACHE_SHIFT)
+                               len = size & ~PAGE_CACHE_MASK;
+                       else
+                               len = PAGE_CACHE_SIZE;
+                       if (map) {
+                               cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                       inode->i_blkbits);
+                               pblock = map->m_pblk + (cur_logical -
+                                                       map->m_lblk);
+                       }
                        index++;
  
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
  
-                       pages_skipped = mpd->wbc->pages_skipped;
-                       err = mapping->a_ops->writepage(page, mpd->wbc);
-                       if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                               /*
-                                * have successfully written the page
-                                * without skipping the same
-                                */
-                               mpd->pages_written++;
                        /*
-                        * In error case, we have to continue because
-                        * remaining pages are still locked
-                        * XXX: unlock and re-dirty them?
+                        * If the page does not have buffers (for
+                        * whatever reason), try to create them using
 -                       * block_prepare_write.  If this fails,
++                       * __block_write_begin.  If this fails,
+                        * redirty the page and move on.
                         */
-                       if (ret == 0)
-                               ret = err;
-               }
-               pagevec_release(&pvec);
-       }
-       return ret;
- }
- /*
-  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
-  *
-  * the function goes through all passed space and put actual disk
-  * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
-  */
- static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                struct ext4_map_blocks *map)
- {
-       struct inode *inode = mpd->inode;
-       struct address_space *mapping = inode->i_mapping;
-       int blocks = map->m_len;
-       sector_t pblock = map->m_pblk, cur_logical;
-       struct buffer_head *head, *bh;
-       pgoff_t index, end;
-       struct pagevec pvec;
-       int nr_pages, i;
-       index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       pagevec_init(&pvec, 0);
-       while (index <= end) {
-               /* XXX: optimize tail */
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-                       index = page->index;
-                       if (index > end)
-                               break;
-                       index++;
-                       BUG_ON(!PageLocked(page));
-                       BUG_ON(PageWriteback(page));
-                       BUG_ON(!page_has_buffers(page));
-                       bh = page_buffers(page);
-                       head = bh;
-                       /* skip blocks out of the range */
-                       do {
-                               if (cur_logical >= map->m_lblk)
-                                       break;
-                               cur_logical++;
-                       } while ((bh = bh->b_this_page) != head);
+                       if (!page_has_buffers(page)) {
 -                              if (block_prepare_write(page, 0, len,
++                              if (__block_write_begin(page, 0, len,
+                                               noalloc_get_block_write)) {
+                               redirty_page:
+                                       redirty_page_for_writepage(mpd->wbc,
+                                                                  page);
+                                       unlock_page(page);
+                                       continue;
+                               }
+                               commit_write = 1;
+                       }
  
+                       bh = page_bufs = page_buffers(page);
+                       block_start = 0;
                        do {
-                               if (cur_logical >= map->m_lblk + blocks)
-                                       break;
-                               if (buffer_delay(bh) || buffer_unwritten(bh)) {
-                                       BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
+                               if (!bh)
+                                       goto redirty_page;
+                               if (map && (cur_logical >= map->m_lblk) &&
+                                   (cur_logical <= (map->m_lblk +
+                                                    (map->m_len - 1)))) {
                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
-                                       } else {
-                                               /*
-                                                * unwritten already should have
-                                                * blocknr assigned. Verify that
-                                                */
-                                               clear_buffer_unwritten(bh);
-                                               BUG_ON(bh->b_blocknr != pblock);
                                        }
+                                       if (buffer_unwritten(bh) ||
+                                           buffer_mapped(bh))
+                                               BUG_ON(bh->b_blocknr != pblock);
+                                       if (map->m_flags & EXT4_MAP_UNINIT)
+                                               set_buffer_uninit(bh);
+                                       clear_buffer_unwritten(bh);
+                               }
  
-                               } else if (buffer_mapped(bh))
-                                       BUG_ON(bh->b_blocknr != pblock);
-                               if (map->m_flags & EXT4_MAP_UNINIT)
-                                       set_buffer_uninit(bh);
+                               /* redirty page if block allocation undone */
+                               if (buffer_delay(bh) || buffer_unwritten(bh))
+                                       redirty_page = 1;
+                               bh = bh->b_this_page;
+                               block_start += bh->b_size;
                                cur_logical++;
                                pblock++;
-                       } while ((bh = bh->b_this_page) != head);
+                       } while (bh != page_bufs);
+                       if (redirty_page)
+                               goto redirty_page;
+                       if (commit_write)
+                               /* mark the buffer_heads as dirty & uptodate */
+                               block_commit_write(page, 0, len);
+                       /*
+                        * Delalloc doesn't support data journalling,
+                        * but eventually maybe we'll lift this
+                        * restriction.
+                        */
+                       if (unlikely(journal_data && PageChecked(page)))
+                               err = __ext4_journalled_writepage(page, len);
+                       else
+                               err = ext4_bio_write_page(&io_submit, page,
+                                                         len, mpd->wbc);
+                       if (!err)
+                               mpd->pages_written++;
+                       /*
+                        * In the error case we have to continue
+                        * because the remaining pages are still locked
+                        */
+                       if (ret == 0)
+                               ret = err;
                }
                pagevec_release(&pvec);
        }
+       ext4_io_submit(&io_submit);
+       return ret;
  }
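
The index/end/cur_logical arithmetic that mpage_da_submit_io() inherits from the old mpage_put_bnr_to_bhs() converts logical file blocks into page-cache indices and back. Below is a minimal user-space sketch of that computation; PAGE_SHIFT and blkbits here are illustrative stand-ins for the kernel's PAGE_CACHE_SHIFT and inode->i_blkbits, not the kernel definitions.

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4096-byte pages (illustrative) */

int main(void)
{
	unsigned blkbits = 10;          /* 1024-byte blocks, as in i_blkbits */
	unsigned long m_lblk = 37, m_len = 8;

	/* first and last page-cache index covered by the mapped range */
	unsigned long index = m_lblk >> (PAGE_SHIFT - blkbits);
	unsigned long end = (m_lblk + m_len - 1) >> (PAGE_SHIFT - blkbits);

	/* logical block of the first buffer on the first page */
	unsigned long cur_logical = index << (PAGE_SHIFT - blkbits);

	printf("pages %lu..%lu, first logical block %lu\n",
	       index, end, cur_logical);
	return 0;
}
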
  
  static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                        sector_t logical, long blk_cnt)
  {
@@@ -2187,35 -2193,32 +2193,32 @@@ static void ext4_print_free_blocks(stru
  }
  
  /*
-  * mpage_da_map_blocks - go through given space
+  * mpage_da_map_and_submit - go through the given space, map the
+  *       blocks if necessary, and then submit them for I/O
   *
   * @mpd - bh describing space
   *
   * The function skips space we know is already mapped to disk blocks.
   *
   */
- static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
  {
        int err, blks, get_blocks_flags;
-       struct ext4_map_blocks map;
+       struct ext4_map_blocks map, *mapp = NULL;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;
  
        /*
-        * We consider only non-mapped and non-allocated blocks
-        */
-       if ((mpd->b_state  & (1 << BH_Mapped)) &&
-               !(mpd->b_state & (1 << BH_Delay)) &&
-               !(mpd->b_state & (1 << BH_Unwritten)))
-               return 0;
-       /*
-        * If we didn't accumulate anything to write simply return
+        * If the blocks are mapped already, or we couldn't accumulate
+        * any blocks, then proceed immediately to the submission stage.
         */
-       if (!mpd->b_size)
-               return 0;
+       if ((mpd->b_size == 0) ||
+           ((mpd->b_state  & (1 << BH_Mapped)) &&
+            !(mpd->b_state & (1 << BH_Delay)) &&
+            !(mpd->b_state & (1 << BH_Unwritten))))
+               goto submit_io;
  
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);
  
                err = blks;
                /*
-                * If get block returns with error we simply
-                * return. Later writepage will redirty the page and
-                * writepages will find the dirty page again
+                * If the get-blocks call returns EAGAIN or ENOSPC
+                * and there appear to be free blocks, we will call
+                * ext4_writepage() for all of the pages, which will
+                * just redirty them.
                 */
                if (err == -EAGAIN)
-                       return 0;
+                       goto submit_io;
  
                if (err == -ENOSPC &&
                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
-                       return 0;
+                       goto submit_io;
                }
  
                /*
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
-               return err;
+               return;
        }
        BUG_ON(blks == 0);
  
+       mapp = &map;
        if (map.m_flags & EXT4_MAP_NEW) {
                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
                int i;
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
        }
  
-       /*
-        * If blocks are delayed marked, we need to
-        * put actual blocknr and drop delayed bit
-        */
-       if ((mpd->b_state & (1 << BH_Delay)) ||
-           (mpd->b_state & (1 << BH_Unwritten)))
-               mpage_put_bnr_to_bhs(mpd, &map);
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
-                       return err;
+                       /* This only happens if the journal is aborted */
+                       return;
        }
  
        /*
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
-               return ext4_mark_inode_dirty(handle, mpd->inode);
+               err = ext4_mark_inode_dirty(handle, mpd->inode);
+               if (err)
+                       ext4_error(mpd->inode->i_sb,
+                                  "Failed to mark inode %lu dirty",
+                                  mpd->inode->i_ino);
        }
  
-       return 0;
+ submit_io:
+       mpage_da_submit_io(mpd, mapp);
+       mpd->io_done = 1;
  }
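
The guard at the top of mpage_da_map_and_submit() tests buffer-state bits that were OR'ed into mpd->b_state while dirty pages were being collected. A hedged, stand-alone sketch of the same test follows; the BH_* values here are illustrative, not the kernel's actual enum ordering.

#include <stdio.h>

enum { BH_Mapped = 0, BH_Delay = 1, BH_Unwritten = 2 }; /* illustrative */

static int needs_allocation(unsigned long b_state, unsigned b_size)
{
	/* mirror the check in mpage_da_map_and_submit(): nothing to map
	 * when no blocks were accumulated, or when everything is already
	 * mapped and neither delayed nor unwritten */
	if (b_size == 0)
		return 0;
	if ((b_state & (1 << BH_Mapped)) &&
	    !(b_state & (1 << BH_Delay)) &&
	    !(b_state & (1 << BH_Unwritten)))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", needs_allocation(1 << BH_Mapped, 4096));  /* 0 */
	printf("%d\n", needs_allocation(1 << BH_Delay, 4096));   /* 1 */
	return 0;
}
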
  
  #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@@ -2401,9 -2405,7 +2405,7 @@@ flush_it
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-       if (mpage_da_map_blocks(mpd) == 0)
-               mpage_da_submit_io(mpd);
-       mpd->io_done = 1;
+       mpage_da_map_and_submit(mpd);
        return;
  }
  
@@@ -2422,9 -2424,9 +2424,9 @@@ static int ext4_bh_delay_or_unwritten(h
  * The function finds extents of pages and scans them for all blocks.
   */
  static int __mpage_da_writepage(struct page *page,
-                               struct writeback_control *wbc, void *data)
+                               struct writeback_control *wbc,
+                               struct mpage_da_data *mpd)
  {
-       struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
        struct buffer_head *bh, *head;
        sector_t logical;
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                * and start IO on them using writepage()
+                * and start IO on them
                 */
                if (mpd->next_page != mpd->first_page) {
-                       if (mpage_da_map_blocks(mpd) == 0)
-                               mpage_da_submit_io(mpd);
+                       mpage_da_map_and_submit(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
-                       mpd->io_done = 1;
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return MPAGE_DA_EXTENT_TAIL;
@@@ -2550,7 -2550,8 +2550,7 @@@ static int ext4_da_get_block_prep(struc
                if (buffer_delay(bh))
                        return 0; /* Not sure this could or should happen */
                /*
 -               * XXX: __block_prepare_write() unmaps passed block,
 -               * is it OK?
 +               * XXX: __block_write_begin() unmaps passed block, is it OK?
                 */
                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
  /*
   * This function is used as a standard get_block_t callback function
   * when there is no desire to allocate any blocks.  It is used as a
 - * callback function for block_prepare_write() and block_write_full_page().
 + * callback function for block_write_begin() and block_write_full_page().
   * These functions should only try to map a single block at a time.
   *
   * Since this function doesn't do block allocations even if the caller
@@@ -2622,6 -2623,7 +2622,7 @@@ static int __ext4_journalled_writepage(
        int ret = 0;
        int err;
  
+       ClearPageChecked(page);
        page_bufs = page_buffers(page);
        BUG_ON(!page_bufs);
        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@@ -2699,7 -2701,7 +2700,7 @@@ static void ext4_end_io_buffer_write(st
  static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
  {
-       int ret = 0;
+       int ret = 0, commit_write = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
        else
                len = PAGE_CACHE_SIZE;
  
-       if (page_has_buffers(page)) {
-               page_bufs = page_buffers(page);
-               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                       ext4_bh_delay_or_unwritten)) {
-                       /*
-                        * We don't want to do  block allocation
-                        * So redirty the page and return
-                        * We may reach here when we do a journal commit
-                        * via journal_submit_inode_data_buffers.
-                        * If we don't have mapping block we just ignore
-                        * them. We can also reach here via shrink_page_list
-                        */
+       /*
+        * If the page does not have buffers (for whatever reason),
 -       * try to create them using block_prepare_write.  If this
++       * try to create them using __block_write_begin.  If this
+        * fails, redirty the page and move on.
+        */
+       if (!page_buffers(page)) {
 -              if (block_prepare_write(page, 0, len,
++              if (__block_write_begin(page, 0, len,
+                                       noalloc_get_block_write)) {
+               redirty_page:
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
-       } else {
+               commit_write = 1;
+       }
+       page_bufs = page_buffers(page);
+       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                             ext4_bh_delay_or_unwritten)) {
                /*
-                * The test for page_has_buffers() is subtle:
-                * We know the page is dirty but it lost buffers. That means
-                * that at some moment in time after write_begin()/write_end()
-                * has been called all buffers have been clean and thus they
-                * must have been written at least once. So they are all
-                * mapped and we can happily proceed with mapping them
-                * and writing the page.
-                *
-                * Try to initialize the buffer_heads and check whether
-                * all are mapped and non delay. We don't want to
-                * do block allocation here.
+                * We don't want to do block allocation, so redirty
+                * the page and return.  We may reach here when we do
+                * a journal commit via
+                * journal_submit_inode_data_buffers.  If we don't
+                * have mapping blocks we just ignore them.  We can
+                * also reach here via shrink_page_list.
                 */
-               ret = __block_write_begin(page, 0, len,
-                                         noalloc_get_block_write);
-               if (!ret) {
-                       page_bufs = page_buffers(page);
-                       /* check whether all are mapped and non delay */
-                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                               ext4_bh_delay_or_unwritten)) {
-                               redirty_page_for_writepage(wbc, page);
-                               unlock_page(page);
-                               return 0;
-                       }
-               } else {
-                       /*
-                        * We can't do block allocation here
-                        * so just redity the page and unlock
-                        * and return
-                        */
-                       redirty_page_for_writepage(wbc, page);
-                       unlock_page(page);
-                       return 0;
-               }
+               goto redirty_page;
+       }
+       if (commit_write)
                /* now mark the buffer_heads as dirty and uptodate */
                block_commit_write(page, 0, len);
-       }
  
-       if (PageChecked(page) && ext4_should_journal_data(inode)) {
+       if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
-               ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
-       }
  
-       if (page_bufs && buffer_uninit(page_bufs)) {
+       if (buffer_uninit(page_bufs)) {
                ext4_set_bh_endio(page_bufs, inode);
                ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                            wbc, ext4_end_io_buffer_write);
@@@ -2823,25 -2800,32 +2799,32 @@@ static int ext4_da_writepages_trans_blo
   */
  static int write_cache_pages_da(struct address_space *mapping,
                                struct writeback_control *wbc,
-                               struct mpage_da_data *mpd)
+                               struct mpage_da_data *mpd,
+                               pgoff_t *done_index)
  {
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
-       int nr_pages;
+       unsigned nr_pages;
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        long nr_to_write = wbc->nr_to_write;
+       int tag;
  
        pagevec_init(&pvec, 0);
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
  
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+       *done_index = index;
        while (!done && (index <= end)) {
                int i;
  
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                             PAGECACHE_TAG_DIRTY,
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
                                break;
                        }
  
+                       *done_index = page->index + 1;
                        lock_page(page);
  
                        /*
@@@ -2946,6 -2932,8 +2931,8 @@@ static int ext4_da_writepages(struct ad
        long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+       pgoff_t done_index = 0;
+       pgoff_t end;
  
        trace_ext4_da_writepages(inode, wbc);
  
                wbc->range_start = index << PAGE_CACHE_SHIFT;
                wbc->range_end  = LLONG_MAX;
                wbc->range_cyclic = 0;
-       } else
+               end = -1;
+       } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+       }
  
        /*
         * This works around two forms of stupidity.  The first is in
         * sbi->max_writeback_mb_bump whichever is smaller.
         */
        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-       if (!range_cyclic && range_whole)
-               desired_nr_to_write = wbc->nr_to_write * 8;
-       else
+       if (!range_cyclic && range_whole) {
+               if (wbc->nr_to_write == LONG_MAX)
+                       desired_nr_to_write = wbc->nr_to_write;
+               else
+                       desired_nr_to_write = wbc->nr_to_write * 8;
+       } else
                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                           max_pages);
        if (desired_nr_to_write > max_pages)
        pages_skipped = wbc->pages_skipped;
  
  retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
        while (!ret && wbc->nr_to_write > 0) {
  
                /*
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-               ret = write_cache_pages_da(mapping, wbc, &mpd);
+               ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                       if (mpage_da_map_blocks(&mpd) == 0)
-                               mpage_da_submit_io(&mpd);
-                       mpd.io_done = 1;
+                       mpage_da_map_and_submit(&mpd);
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
                trace_ext4_da_write_pages(inode, &mpd);
                         __func__, wbc->nr_to_write, ret);
  
        /* Update index */
-       index += pages_written;
        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-               mapping->writeback_index = index;
+               mapping->writeback_index = done_index;
  
  out_writepages:
        wbc->nr_to_write -= nr_to_writebump;
@@@ -3456,15 -3450,6 +3449,6 @@@ ext4_readpages(struct file *file, struc
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
  }
  
- static void ext4_free_io_end(ext4_io_end_t *io)
- {
-       BUG_ON(!io);
-       if (io->page)
-               put_page(io->page);
-       iput(io->inode);
-       kfree(io);
- }
  static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
  {
        struct buffer_head *head, *bh;
@@@ -3641,173 -3626,6 +3625,6 @@@ static int ext4_get_block_write(struct 
                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
  }
  
- static void dump_completed_IO(struct inode * inode)
- {
- #ifdef        EXT4_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
- #endif
- }
- /*
-  * check a range of space and convert unwritten extents to written.
-  */
- static int ext4_end_io_nolock(ext4_io_end_t *io)
- {
-       struct inode *inode = io->inode;
-       loff_t offset = io->offset;
-       ssize_t size = io->size;
-       int ret = 0;
-       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                  "list->prev 0x%p\n",
-                  io, inode->i_ino, io->list.next, io->list.prev);
-       if (list_empty(&io->list))
-               return ret;
-       if (io->flag != EXT4_IO_UNWRITTEN)
-               return ret;
-       ret = ext4_convert_unwritten_extents(inode, offset, size);
-       if (ret < 0) {
-               printk(KERN_EMERG "%s: failed to convert unwritten"
-                       "extents to written extents, error is %d"
-                       " io is still on inode %lu aio dio list\n",
-                        __func__, ret, inode->i_ino);
-               return ret;
-       }
-       if (io->iocb)
-               aio_complete(io->iocb, io->result, 0);
-       /* clear the DIO AIO unwritten flag */
-       io->flag = 0;
-       return ret;
- }
- /*
-  * work on completed aio dio IO, to convert unwritten extents to extents
-  */
- static void ext4_end_io_work(struct work_struct *work)
- {
-       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-       struct inode            *inode = io->inode;
-       struct ext4_inode_info  *ei = EXT4_I(inode);
-       unsigned long           flags;
-       int                     ret;
-       mutex_lock(&inode->i_mutex);
-       ret = ext4_end_io_nolock(io);
-       if (ret < 0) {
-               mutex_unlock(&inode->i_mutex);
-               return;
-       }
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (!list_empty(&io->list))
-               list_del_init(&io->list);
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       mutex_unlock(&inode->i_mutex);
-       ext4_free_io_end(io);
- }
- /*
-  * This function is called from ext4_sync_file().
-  *
-  * When IO is completed, the work to convert unwritten extents to
-  * written is queued on workqueue but may not get immediately
-  * scheduled. When fsync is called, we need to ensure the
-  * conversion is complete before fsync returns.
-  * The inode keeps track of a list of pending/completed IO that
-  * might needs to do the conversion. This function walks through
-  * the list and convert the related unwritten extents for completed IO
-  * to written.
-  * The function return the number of pending IOs on success.
-  */
- int flush_completed_IO(struct inode *inode)
- {
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
-       if (list_empty(&ei->i_completed_io_list))
-               return ret;
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * convertion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               if (ret < 0)
-                       ret2 = ret;
-               else
-                       list_del_init(&io->list);
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
- }
- static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
- {
-       ext4_io_end_t *io = NULL;
-       io = kmalloc(sizeof(*io), flags);
-       if (io) {
-               igrab(inode);
-               io->inode = inode;
-               io->flag = 0;
-               io->offset = 0;
-               io->size = 0;
-               io->page = NULL;
-               io->iocb = NULL;
-               io->result = 0;
-               INIT_WORK(&io->work, ext4_end_io_work);
-               INIT_LIST_HEAD(&io->list);
-       }
-       return io;
- }
  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private, int ret,
                            bool is_async)
                  size);
  
        /* if not aio dio with unwritten extents, just free io and return */
-       if (io_end->flag != EXT4_IO_UNWRITTEN){
+       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
                iocb->private = NULL;
  out:
        }
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
  
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
        /* Add the io_end to per-inode completed aio dio list*/
        ei = EXT4_I(io_end->inode);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        list_add_tail(&io_end->list, &ei->i_completed_io_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
        iocb->private = NULL;
  }
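
The reordering in ext4_end_io_dio() above is deliberate: the io_end is added to the per-inode completed-IO list before the work is queued, so the worker can never go looking for an entry that has not been published yet. A toy user-space sketch of the publish-then-wake pattern, with pthreads standing in for the kernel workqueue (all names here are illustrative, nothing is kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int pending;                 /* stands in for i_completed_io_list */

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	printf("worker sees %d pending item(s)\n", pending);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_mutex_lock(&lock);
	pending++;                              /* add to the list first... */
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, worker, NULL); /* ...then kick the worker */
	pthread_join(t, NULL);
	return 0;
}
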
  
@@@ -3872,7 -3690,7 +3689,7 @@@ static void ext4_end_io_buffer_write(st
                goto out;
        }
  
-       io_end->flag = EXT4_IO_UNWRITTEN;
+       io_end->flag = EXT4_IO_END_UNWRITTEN;
        inode = io_end->inode;
  
        /* Add the io_end to per-inode completed io list*/
@@@ -5463,6 -5281,7 +5280,7 @@@ int ext4_setattr(struct dentry *dentry
  {
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
+       int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
  
        error = inode_change_ok(inode, attr);
                        error = PTR_ERR(handle);
                        goto err_out;
                }
-               error = ext4_orphan_add(handle, inode);
+               if (ext4_handle_valid(handle)) {
+                       error = ext4_orphan_add(handle, inode);
+                       orphan = 1;
+               }
                EXT4_I(inode)->i_disksize = attr->ia_size;
                rc = ext4_mark_inode_dirty(handle, inode);
                if (!error)
                                        goto err_out;
                                }
                                ext4_orphan_del(handle, inode);
+                               orphan = 0;
                                ext4_journal_stop(handle);
                                goto err_out;
                        }
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
-       if (inode->i_nlink)
+       if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);
  
        if (!rc && (ia_valid & ATTR_MODE))
@@@ -5642,7 -5464,7 +5463,7 @@@ static int ext4_index_trans_blocks(stru
   *
   * Also account for superblock, inode, quota and xattr blocks
   */
- int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
diff --combined fs/ext4/mballoc.c
  static struct kmem_cache *ext4_pspace_cachep;
  static struct kmem_cache *ext4_ac_cachep;
  static struct kmem_cache *ext4_free_ext_cachep;
+ /* We create slab caches for groupinfo data structures based on the
+  * superblock block size.  There will be one cache for each unique
+  * s_blocksize_bits, shared by all filesystems with that block size. */
+ #define NR_GRPINFO_CACHES     \
+       (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+ static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
  static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@@ -938,6 -946,85 +946,85 @@@ out
        return err;
  }
  
+ /*
+  * lock the group_info alloc_sem of all the groups
+  * belonging to the same buddy cache page.  This
+  * makes sure no other parallel operation on the
+  * buddy cache can happen while we hold the buddy
+  * cache lock.
+  */
+ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+                                       ext4_group_t group)
+ {
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       int groups_per_page;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+       groups_per_page = blocks_per_page >> 1;
+       if (groups_per_page == 0)
+               groups_per_page = 1;
+       /* read all groups the page covers into the cache */
+       for (i = 0; i < groups_per_page; i++) {
+               if ((first_group + i) >= ngroups)
+                       break;
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* take each group's write allocation
+                * semaphore.  This makes sure there is
+                * no block allocation going on in any
+                * of those groups
+                */
+               down_write_nested(&grp->alloc_sem, i);
+       }
+       return i;
+ }
+ static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                        ext4_group_t group, int locked_group)
+ {
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+       /* release locks on all the groups */
+       for (i = 0; i < locked_group; i++) {
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* release each group's write allocation
+                * semaphore taken in
+                * ext4_mb_get_buddy_cache_lock()
+                */
+               up_write(&grp->alloc_sem);
+       }
+ }
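
Both helpers recover, from a group number, the buddy-cache page that holds its two blocks and the first group sharing that page. A stand-alone sketch of the arithmetic, assuming (for the example only) 4K pages and a 4K block size:

#include <stdio.h>

int main(void)
{
	unsigned page_size = 4096, blocksize = 4096; /* illustrative */
	unsigned blocks_per_page = page_size / blocksize;
	unsigned group = 7;

	/* two consecutive blocks per group: bitmap, then buddy */
	unsigned block = group * 2;
	unsigned pnum = block / blocks_per_page;
	unsigned first_group = pnum * blocks_per_page / 2;
	unsigned groups_per_page = blocks_per_page >> 1;

	if (groups_per_page == 0)
		groups_per_page = 1;

	printf("group %u lives on page %u; that page covers groups %u..%u\n",
	       group, pnum, first_group, first_group + groups_per_page - 1);
	return 0;
}
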
  /*
   * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
   * block group lock of all groups for this page; do not hold the BG lock when
@@@ -1915,84 -2002,6 +2002,6 @@@ static int ext4_mb_good_group(struct ex
        return 0;
  }
  
- /*
-  * lock the group_info alloc_sem of all the groups
-  * belonging to the same buddy cache page. This
-  * make sure other parallel operation on the buddy
-  * cache doesn't happen  whild holding the buddy cache
-  * lock
-  */
- int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
- {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       int groups_per_page;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       groups_per_page = blocks_per_page >> 1;
-       if (groups_per_page == 0)
-               groups_per_page = 1;
-       /* read all groups the page covers into the cache */
-       for (i = 0; i < groups_per_page; i++) {
-               if ((first_group + i) >= ngroups)
-                       break;
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               down_write_nested(&grp->alloc_sem, i);
-       }
-       return i;
- }
- void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                       ext4_group_t group, int locked_group)
- {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       /* release locks on all the groups */
-       for (i = 0; i < locked_group; i++) {
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               up_write(&grp->alloc_sem);
-       }
- }
  static noinline_for_stack int
  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
  {
@@@ -2233,15 -2242,24 +2242,24 @@@ static const struct file_operations ext
        .release        = seq_release,
  };
  
+ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+ {
+       int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+       struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+       BUG_ON(!cachep);
+       return cachep;
+ }
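
Assuming EXT4_MIN_BLOCK_LOG_SIZE and EXT4_MAX_BLOCK_LOG_SIZE are 10 and 16 (1K through 64K blocks), the lookup above reduces to an offset into a seven-slot array, one slot per supported block size. A quick illustrative sketch, with the two constants treated as assumptions:

#include <stdio.h>

#define MIN_BLOCK_LOG_SIZE 10   /* 1K blocks  (assumed to match ext4) */
#define MAX_BLOCK_LOG_SIZE 16   /* 64K blocks (assumed to match ext4) */
#define NR_GRPINFO_CACHES (MAX_BLOCK_LOG_SIZE - MIN_BLOCK_LOG_SIZE + 1)

int main(void)
{
	int bits;

	for (bits = MIN_BLOCK_LOG_SIZE; bits <= MAX_BLOCK_LOG_SIZE; bits++)
		printf("blocksize %6d -> groupinfo cache index %d of %d\n",
		       1 << bits, bits - MIN_BLOCK_LOG_SIZE,
		       NR_GRPINFO_CACHES);
	return 0;
}
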
  
  /* Create and initialize ext4_group_info data for the given group. */
  int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                          struct ext4_group_desc *desc)
  {
-       int i, len;
+       int i;
        int metalen = 0;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **meta_group_info;
+       struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
  
        /*
         * First check if this group is the first of a reserved block.
                        meta_group_info;
        }
  
-       /*
-        * calculate needed size. if change bb_counters size,
-        * don't forget about ext4_mb_generate_buddy()
-        */
-       len = offsetof(typeof(**meta_group_info),
-                      bb_counters[sb->s_blocksize_bits + 2]);
        meta_group_info =
                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
  
-       meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+       meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
        if (meta_group_info[i] == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
                goto exit_group_info;
        }
+       memset(meta_group_info[i], 0, kmem_cache_size(cachep));
        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                &(meta_group_info[i]->bb_state));
  
@@@ -2331,6 -2343,7 +2343,7 @@@ static int ext4_mb_init_backend(struct 
        int num_meta_group_infos_max;
        int array_size;
        struct ext4_group_desc *desc;
+       struct kmem_cache *cachep;
  
        /* This is the number of blocks used by GDT */
        num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
                printk(KERN_ERR "EXT4-fs: can't get new inode\n");
                goto err_freesgi;
        }
 +      sbi->s_buddy_cache->i_ino = get_next_ino();
        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
        for (i = 0; i < ngroups; i++) {
                desc = ext4_get_group_desc(sb, i, NULL);
        return 0;
  
  err_freebuddy:
+       cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        while (i-- > 0)
-               kfree(ext4_get_group_info(sb, i));
+               kmem_cache_free(cachep, ext4_get_group_info(sb, i));
        i = num_meta_group_infos;
        while (i-- > 0)
                kfree(sbi->s_group_info[i]);
@@@ -2407,19 -2420,48 +2421,48 @@@ int ext4_mb_init(struct super_block *sb
        unsigned offset;
        unsigned max;
        int ret;
+       int cache_index;
+       struct kmem_cache *cachep;
+       char *namep = NULL;
  
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
  
        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_offsets == NULL) {
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
  
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
-               kfree(sbi->s_mb_offsets);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
+       }
+       cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+       cachep = ext4_groupinfo_caches[cache_index];
+       if (!cachep) {
+               char name[32];
+               int len = offsetof(struct ext4_group_info,
+                                       bb_counters[sb->s_blocksize_bits + 2]);
+               sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+               namep = kstrdup(name, GFP_KERNEL);
+               if (!namep) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               /* The name from kmem_cache_name() needs to
+                * be freed when we destroy the slab */
+               cachep = kmem_cache_create(namep, len, 0,
+                                            SLAB_RECLAIM_ACCOUNT, NULL);
+               if (!cachep) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ext4_groupinfo_caches[cache_index] = cachep;
        }
  
        /* order 0 is regular bitmap */
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret != 0) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-               return ret;
+               goto out;
        }
  
        spin_lock_init(&sbi->s_md_lock);
  
        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (sbi->s_locality_groups == NULL) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
        for_each_possible_cpu(i) {
                struct ext4_locality_group *lg;
  
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-       return 0;
+ out:
+       if (ret) {
+               kfree(sbi->s_mb_offsets);
+               kfree(sbi->s_mb_maxs);
+               kfree(namep);
+       }
+       return ret;
  }
  
  /* need to called with the ext4 group lock held */
@@@ -2504,6 -2549,7 +2550,7 @@@ int ext4_mb_release(struct super_block 
        int num_meta_group_infos;
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
  
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
                        ext4_lock_group(sb, i);
                        ext4_mb_cleanup_pa(grinfo);
                        ext4_unlock_group(sb, i);
-                       kfree(grinfo);
+                       kmem_cache_free(cachep, grinfo);
                }
                num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
        return 0;
  }
  
- static inline void ext4_issue_discard(struct super_block *sb,
+ static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t block, int count)
  {
        int ret;
        discard_block = block + ext4_group_first_block_no(sb, block_group);
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
 -      ret = sb_issue_discard(sb, discard_block, count);
 +      ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-       if (ret == EOPNOTSUPP) {
+       if (ret == -EOPNOTSUPP) {
                ext4_warning(sb, "discard not supported, disabling");
                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
        }
+       return ret;
  }
  
  /*
@@@ -2659,28 -2706,22 +2707,22 @@@ static void ext4_remove_debugfs_entry(v
  
  #endif
  
- int __init init_ext4_mballoc(void)
+ int __init ext4_init_mballoc(void)
  {
-       ext4_pspace_cachep =
-               kmem_cache_create("ext4_prealloc_space",
-                                    sizeof(struct ext4_prealloc_space),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+                                       SLAB_RECLAIM_ACCOUNT);
        if (ext4_pspace_cachep == NULL)
                return -ENOMEM;
  
-       ext4_ac_cachep =
-               kmem_cache_create("ext4_alloc_context",
-                                    sizeof(struct ext4_allocation_context),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+                                   SLAB_RECLAIM_ACCOUNT);
        if (ext4_ac_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                return -ENOMEM;
        }
  
-       ext4_free_ext_cachep =
-               kmem_cache_create("ext4_free_block_extents",
-                                    sizeof(struct ext4_free_data),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+                                         SLAB_RECLAIM_ACCOUNT);
        if (ext4_free_ext_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                kmem_cache_destroy(ext4_ac_cachep);
        return 0;
  }
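
For reference, KMEM_CACHE() is the convenience macro from <linux/slab.h> that the conversions above switch to; it derives the cache name, object size, and alignment from the struct itself rather than repeating them at each call site. Its definition at the time was essentially:

#define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,	\
		sizeof(struct __struct), __alignof__(struct __struct),	\
		(__flags), NULL)

So, for example, KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT) creates a cache named "ext4_prealloc_space" sized and aligned for struct ext4_prealloc_space, matching the open-coded kmem_cache_create() calls it replaces.
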
  
- void exit_ext4_mballoc(void)
+ void ext4_exit_mballoc(void)
  {
+       int i;
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
+       for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+               struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+               if (cachep) {
+                       char *name = (char *)kmem_cache_name(cachep);
+                       kmem_cache_destroy(cachep);
+                       kfree(name);
+               }
+       }
        ext4_remove_debugfs_entry();
  }
  
@@@ -3536,8 -3587,7 +3588,7 @@@ static int ext4_mb_new_preallocation(st
   */
  static noinline_for_stack int
  ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-                       struct ext4_prealloc_space *pa,
-                       struct ext4_allocation_context *ac)
+                       struct ext4_prealloc_space *pa)
  {
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
  
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = pa->pa_inode;
-       }
        while (bit < end) {
                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;
  
-               if (ac) {
-                       ac->ac_b_ex.fe_group = group;
-                       ac->ac_b_ex.fe_start = bit;
-                       ac->ac_b_ex.fe_len = next - bit;
-                       ac->ac_b_ex.fe_logical = 0;
-                       trace_ext4_mballoc_discard(ac);
-               }
-               trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
-                                              next - bit);
+               trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+               trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+                                              grp_blk_start + bit, next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
  
  static noinline_for_stack int
  ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-                               struct ext4_prealloc_space *pa,
-                               struct ext4_allocation_context *ac)
+                               struct ext4_prealloc_space *pa)
  {
        struct super_block *sb = e4b->bd_sb;
        ext4_group_t group;
        ext4_grpblk_t bit;
  
-       trace_ext4_mb_release_group_pa(sb, ac, pa);
+       trace_ext4_mb_release_group_pa(sb, pa);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = NULL;
-               ac->ac_b_ex.fe_group = group;
-               ac->ac_b_ex.fe_start = bit;
-               ac->ac_b_ex.fe_len = pa->pa_len;
-               ac->ac_b_ex.fe_logical = 0;
-               trace_ext4_mballoc_discard(ac);
-       }
+       trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
  
        return 0;
  }
@@@ -3645,7 -3673,6 +3674,6 @@@ ext4_mb_discard_group_preallocations(st
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
        struct list_head list;
        struct ext4_buddy e4b;
        int err;
                needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
  
        INIT_LIST_HEAD(&list);
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac)
-               ac->ac_sb = sb;
  repeat:
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
                spin_unlock(pa->pa_obj_lock);
  
                if (pa->pa_type == MB_GROUP_PA)
-                       ext4_mb_release_group_pa(&e4b, pa, ac);
+                       ext4_mb_release_group_pa(&e4b, pa);
                else
-                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
  
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
  
  out:
        ext4_unlock_group(sb, group);
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
        return free;
@@@ -3763,7 -3785,6 +3786,6 @@@ void ext4_discard_preallocations(struc
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
        ext4_group_t group = 0;
        struct list_head list;
        struct ext4_buddy e4b;
  
        INIT_LIST_HEAD(&list);
  
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = inode;
-       }
  repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
  
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                ext4_unlock_group(sb, group);
  
                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
  }
  
  /*
@@@ -4061,14 -4075,10 +4076,10 @@@ ext4_mb_discard_lg_preallocations(struc
        struct ext4_buddy e4b;
        struct list_head discard_list;
        struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
  
        mb_debug(1, "discard locality group preallocation\n");
  
        INIT_LIST_HEAD(&discard_list);
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac)
-               ac->ac_sb = sb;
  
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
                }
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-               ext4_mb_release_group_pa(&e4b, pa, ac);
+               ext4_mb_release_group_pa(&e4b, pa);
                ext4_unlock_group(sb, group);
  
                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
  }
  
  /*
@@@ -4492,7 -4500,6 +4501,6 @@@ void ext4_free_blocks(handle_t *handle
  {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
-       struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        unsigned long freed = 0;
        unsigned int overflow;
                        if (!bh)
                                tbh = sb_find_get_block(inode->i_sb,
                                                        block + i);
+                       if (unlikely(!tbh))
+                               continue;
                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                    inode, tbh, block + i);
                }
        if (!ext4_should_writeback_data(inode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
  
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac) {
-               ac->ac_inode = inode;
-               ac->ac_sb = sb;
-       }
  do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
  #endif
-       if (ac) {
-               ac->ac_b_ex.fe_group = block_group;
-               ac->ac_b_ex.fe_start = bit;
-               ac->ac_b_ex.fe_len = count;
-               trace_ext4_mballoc_free(ac);
-       }
+       trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
  
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, block_group, bit, count);
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-               if (test_opt(sb, DISCARD))
-                       ext4_issue_discard(sb, block_group, bit, count);
        }
  
        ret = ext4_free_blks_count(sb, gdp) + count;
@@@ -4686,7 -4684,190 +4685,190 @@@ error_return
                dquot_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
        return;
  }
+ /**
+  * ext4_trim_extent -- function to TRIM one single free extent in the group
+  * @sb:               super block for the file system
+  * @start:    starting block of the free extent in the alloc. group
+  * @count:    number of blocks to TRIM
+  * @group:    alloc. group we are working with
+  * @e4b:      ext4 buddy for the group
+  *
+  * Trim "count" blocks starting at "start" in the "group".  To assure that
+  * no one will allocate those blocks, mark them as used in the buddy bitmap.
+  * This must be called under the group lock.
+  */
+ static int ext4_trim_extent(struct super_block *sb, int start, int count,
+               ext4_group_t group, struct ext4_buddy *e4b)
+ {
+       struct ext4_free_extent ex;
+       int ret = 0;
+       assert_spin_locked(ext4_group_lock_ptr(sb, group));
+       ex.fe_start = start;
+       ex.fe_group = group;
+       ex.fe_len = count;
+       /*
+        * Mark blocks used, so no one can reuse them while
+        * being trimmed.
+        */
+       mb_mark_used(e4b, &ex);
+       ext4_unlock_group(sb, group);
+       ret = ext4_issue_discard(sb, group, start, count);
+       if (ret)
+               ext4_std_error(sb, ret);
+       ext4_lock_group(sb, group);
+       mb_free_blocks(NULL, e4b, start, ex.fe_len);
+       return ret;
+ }
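
Note that ext4_trim_extent() drops the group lock around the discard itself, keeping the range reserved (marked used in the buddy) so it cannot be allocated mid-trim. A toy sketch of that lock dance under stated assumptions; a pthread mutex stands in for ext4_lock_group()/ext4_unlock_group(), and nothing here is kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static void issue_discard(int start, int count)
{
	printf("discarding blocks %d..%d\n", start, start + count - 1);
}

int main(void)
{
	int start = 100, count = 50;

	pthread_mutex_lock(&group_lock);
	/* reserve: mark the range used so nobody can allocate it */
	pthread_mutex_unlock(&group_lock);

	issue_discard(start, count);    /* slow device I/O, lock dropped */

	pthread_mutex_lock(&group_lock);
	/* unreserve: free the range so it is allocatable again */
	pthread_mutex_unlock(&group_lock);
	return 0;
}
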
+ /**
+  * ext4_trim_all_free -- function to trim all free space in alloc. group
+  * @sb:                       super block for file system
+  * @e4b:              ext4 buddy
+  * @start:            first group block to examine
+  * @max:              last group block to examine
+  * @minblocks:                minimum extent block count
+  *
+  * ext4_trim_all_free walks through the group's block bitmap searching for
+  * free extents. When a free extent of at least @minblocks blocks is found,
+  * it is marked as used in the group buddy bitmap and ext4_trim_extent is
+  * called to TRIM it; the extent is then freed again in the buddy bitmap.
+  * This is done until the whole group is scanned.
+  */
+ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+               ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+ {
+       void *bitmap;
+       ext4_grpblk_t next, count = 0;
+       ext4_group_t group;
+       int ret = 0;
+       BUG_ON(e4b == NULL);
+       bitmap = e4b->bd_bitmap;
+       group = e4b->bd_group;
+       start = (e4b->bd_info->bb_first_free > start) ?
+               e4b->bd_info->bb_first_free : start;
+       ext4_lock_group(sb, group);
+       while (start < max) {
+               start = mb_find_next_zero_bit(bitmap, max, start);
+               if (start >= max)
+                       break;
+               next = mb_find_next_bit(bitmap, max, start);
+               if ((next - start) >= minblocks) {
+                       ret = ext4_trim_extent(sb, start,
+                               next - start, group, e4b);
+                       if (ret < 0)
+                               break;
+                       count += next - start;
+               }
+               start = next + 1;
+               if (fatal_signal_pending(current)) {
+                       count = -ERESTARTSYS;
+                       break;
+               }
+               if (need_resched()) {
+                       ext4_unlock_group(sb, group);
+                       cond_resched();
+                       ext4_lock_group(sb, group);
+               }
+               if ((e4b->bd_info->bb_free - count) < minblocks)
+                       break;
+       }
+       ext4_unlock_group(sb, group);
+       ext4_debug("trimmed %d blocks in the group %d\n",
+               count, group);
+       if (ret < 0)
+               count = ret;
+       return count;
+ }
+ /**
+  * ext4_trim_fs() -- trim ioctl handler function
+  * @sb:                       superblock for filesystem
+  * @range:            fstrim_range structure
+  *
+  * start:     First Byte to trim
+  * len:               number of Bytes to trim from start
+  * minlen:    minimum extent length in Bytes
+  * ext4_trim_fs goes through all allocation groups containing Bytes from
+  * start to start+len. For each such group, ext4_trim_all_free is invoked
+  * to trim all free space.
+  */
+ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+ {
+       struct ext4_buddy e4b;
+       ext4_group_t first_group, last_group;
+       ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+       ext4_grpblk_t cnt = 0, first_block, last_block;
+       uint64_t start, len, minlen, trimmed;
+       int ret = 0;
+       start = range->start >> sb->s_blocksize_bits;
+       len = range->len >> sb->s_blocksize_bits;
+       minlen = range->minlen >> sb->s_blocksize_bits;
+       trimmed = 0;
+       if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+               return -EINVAL;
+       /* Determine first and last group to examine based on start and len */
+       ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+                                    &first_group, &first_block);
+       ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+                                    &last_group, &last_block);
+       last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+       last_block = EXT4_BLOCKS_PER_GROUP(sb);
+       if (first_group > last_group)
+               return -EINVAL;
+       for (group = first_group; group <= last_group; group++) {
+               ret = ext4_mb_load_buddy(sb, group, &e4b);
+               if (ret) {
+                       ext4_error(sb, "Error in loading buddy "
+                                       "information for %u", group);
+                       break;
+               }
+               if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+                       len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+               else
+                       last_block = len;
+               if (e4b.bd_info->bb_free >= minlen) {
+                       cnt = ext4_trim_all_free(sb, &e4b, first_block,
+                                               last_block, minlen);
+                       if (cnt < 0) {
+                               ret = cnt;
+                               ext4_mb_unload_buddy(&e4b);
+                               break;
+                       }
+               }
+               ext4_mb_unload_buddy(&e4b);
+               trimmed += cnt;
+               first_block = 0;
+       }
+       range->len = trimmed * sb->s_blocksize;
+       return ret;
+ }
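ext4_trim_fs() is wired up further below as the ->trim_fs super_operation, so it is ultimately driven from user space through the batched-discard (FITRIM) ioctl added in the same patch series. A minimal sketch of such a caller, assuming struct fstrim_range and FITRIM are available from <linux/fs.h> as that series intends:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		/* Trim every free extent on the filesystem mounted at argv[1]. */
		struct fstrim_range range = {
			.start  = 0,
			.len    = UINT64_MAX,
			.minlen = 0,
		};
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		/* On return the kernel has set range.len to the byte count
		 * trimmed, mirroring "range->len = trimmed * sb->s_blocksize"
		 * above. */
		printf("trimmed %llu bytes\n", (unsigned long long) range.len);
		return 0;
	}

The minlen field maps onto the minblocks cut-off that ext4_trim_all_free() applies per extent.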
diff --combined fs/ext4/namei.c
@@@ -856,6 -856,7 +856,7 @@@ static struct buffer_head * ext4_find_e
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
        struct buffer_head *bh, *ret = NULL;
        ext4_lblk_t start, block, b;
+       const u8 *name = d_name->name;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
        namelen = d_name->len;
        if (namelen > EXT4_NAME_LEN)
                return NULL;
+       if ((namelen <= 2) && (name[0] == '.') &&
+           (name[1] == '.' || name[1] == '\0')) {
+               /*
+                * "." or ".." will only be in the first block
+                * NFS may look up ".."; "." should be handled by the VFS
+                */
+               block = start = 0;
+               nblocks = 1;
+               goto restart;
+       }
        if (is_dx(dir)) {
                bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
                /*
@@@ -960,55 -971,35 +971,35 @@@ cleanup_and_exit
  static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
                       struct ext4_dir_entry_2 **res_dir, int *err)
  {
-       struct super_block * sb;
+       struct super_block * sb = dir->i_sb;
        struct dx_hash_info     hinfo;
-       u32 hash;
        struct dx_frame frames[2], *frame;
-       struct ext4_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        ext4_lblk_t block;
        int retval;
-       int namelen = d_name->len;
-       const u8 *name = d_name->name;
  
-       sb = dir->i_sb;
-       /* NFS may look up ".." - look at dx_root directory block */
-       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-               if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-                       return NULL;
-       } else {
-               frame = frames;
-               frame->bh = NULL;                       /* for dx_release() */
-               frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-               dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-       }
-       hash = hinfo.hash;
+       if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+               return NULL;
        do {
                block = dx_get_block(frame->at);
-               if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+               if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                        goto errout;
-               de = (struct ext4_dir_entry_2 *) bh->b_data;
-               top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                      EXT4_DIR_REC_LEN(0));
-               for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-                       int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                                 + ((char *) de - bh->b_data);
-                       if (!ext4_check_dir_entry(dir, de, bh, off)) {
-                               brelse(bh);
-                               *err = ERR_BAD_DX_DIR;
-                               goto errout;
-                       }
  
-                       if (ext4_match(namelen, name, de)) {
-                               *res_dir = de;
-                               dx_release(frames);
-                               return bh;
-                       }
+               retval = search_dirblock(bh, dir, d_name,
+                                        block << EXT4_BLOCK_SIZE_BITS(sb),
+                                        res_dir);
+               if (retval == 1) {      /* Success! */
+                       dx_release(frames);
+                       return bh;
                }
                brelse(bh);
+               if (retval == -1) {
+                       *err = ERR_BAD_DX_DIR;
+                       goto errout;
+               }
                /* Check to see if we should continue to search */
-               retval = ext4_htree_next_block(dir, hash, frame,
+               retval = ext4_htree_next_block(dir, hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext4_warning(sb,
@@@ -2312,7 -2303,7 +2303,7 @@@ retry
  
        inode->i_ctime = ext4_current_time(inode);
        ext4_inc_count(handle, inode);
 -      atomic_inc(&inode->i_count);
 +      ihold(inode);
  
        err = ext4_add_entry(handle, dentry, inode);
        if (!err) {
diff --combined fs/ext4/resize.c
@@@ -226,23 -226,13 +226,13 @@@ static int setup_new_group_blocks(struc
        }
  
        /* Zero out all of the reserved backup group descriptor table blocks */
-       for (i = 0, bit = gdblocks + 1, block = start + bit;
-            i < reserved_gdb; i++, block++, bit++) {
-               struct buffer_head *gdb;
-               ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
-               if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                       goto exit_bh;
+       ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+                       block, sbi->s_itb_per_group);
+       err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
 -                             GFP_NOFS, BLKDEV_IFL_WAIT);
++                             GFP_NOFS);
+       if (err)
+               goto exit_bh;
  
-               if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                       err = PTR_ERR(gdb);
-                       goto exit_bh;
-               }
-               ext4_handle_dirty_metadata(handle, NULL, gdb);
-               ext4_set_bit(bit, bh->b_data);
-               brelse(gdb);
-       }
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
        ext4_set_bit(input->block_bitmap - start, bh->b_data);
        ext4_set_bit(input->inode_bitmap - start, bh->b_data);
  
        /* Zero out all of the inode table blocks */
-       for (i = 0, block = input->inode_table, bit = block - start;
-            i < sbi->s_itb_per_group; i++, bit++, block++) {
-               struct buffer_head *it;
-               ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
-               if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                       goto exit_bh;
-               if (IS_ERR(it = bclean(handle, sb, block))) {
-                       err = PTR_ERR(it);
-                       goto exit_bh;
-               }
-               ext4_handle_dirty_metadata(handle, NULL, it);
-               brelse(it);
-               ext4_set_bit(bit, bh->b_data);
-       }
+       block = input->inode_table;
+       ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+                       block, sbi->s_itb_per_group);
 -      err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
 -                             GFP_NOFS, BLKDEV_IFL_WAIT);
++      err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
+       if (err)
+               goto exit_bh;
  
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
  
-       mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+       ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+                            bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
                goto exit_journal;
        }
  
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-                       bh->b_data);
+       ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+                            bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
  exit_bh:
        brelse(bh);
diff --combined fs/ext4/super.c
@@@ -26,6 -26,7 +26,6 @@@
  #include <linux/init.h>
  #include <linux/blkdev.h>
  #include <linux/parser.h>
 -#include <linux/smp_lock.h>
  #include <linux/buffer_head.h>
  #include <linux/exportfs.h>
  #include <linux/vfs.h>
@@@ -40,6 -41,9 +40,9 @@@
  #include <linux/crc16.h>
  #include <asm/uaccess.h>
  
+ #include <linux/kthread.h>
+ #include <linux/freezer.h>
  #include "ext4.h"
  #include "ext4_jbd2.h"
  #include "xattr.h"
  #define CREATE_TRACE_POINTS
  #include <trace/events/ext4.h>
  
- struct proc_dir_entry *ext4_proc_root;
+ static struct proc_dir_entry *ext4_proc_root;
  static struct kset *ext4_kset;
+ struct ext4_lazy_init *ext4_li_info;
+ struct mutex ext4_li_mtx;
+ struct ext4_features *ext4_feat;
  
  static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@@ -69,6 -76,8 +75,8 @@@ static void ext4_write_super(struct sup
  static int ext4_freeze(struct super_block *sb);
  static int ext4_get_sb(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data, struct vfsmount *mnt);
+ static void ext4_destroy_lazyinit_thread(void);
+ static void ext4_unregister_li_request(struct super_block *sb);
  
  #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
  static struct file_system_type ext3_fs_type = {
@@@ -701,12 -710,14 +709,13 @@@ static void ext4_put_super(struct super
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
  
+       ext4_unregister_li_request(sb);
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
  
        flush_workqueue(sbi->dio_unwritten_wq);
        destroy_workqueue(sbi->dio_unwritten_wq);
  
        lock_super(sb);
 -      lock_kernel();
        if (sb->s_dirt)
                ext4_commit_super(sb, 1);
  
                        ext4_abort(sb, "Couldn't clean up the journal");
        }
  
+       del_timer(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);
         * Now that we are completely done shutting down the
         * superblock, we need to actually destroy the kobject.
         */
 -      unlock_kernel();
        unlock_super(sb);
        kobject_put(&sbi->s_kobj);
        wait_for_completion(&sbi->s_kobj_unregister);
@@@ -1042,6 -1055,12 +1052,12 @@@ static int ext4_show_options(struct seq
            !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
                seq_puts(seq, ",block_validity");
  
+       if (!test_opt(sb, INIT_INODE_TABLE))
+               seq_puts(seq, ",noinit_inode_table");
+       else if (sbi->s_li_wait_mult)
+               seq_printf(seq, ",init_inode_table=%u",
+                          (unsigned) sbi->s_li_wait_mult);
        ext4_show_quota_options(seq, sb);
  
        return 0;
@@@ -1170,6 -1189,7 +1186,7 @@@ static const struct super_operations ex
        .quota_write    = ext4_quota_write,
  #endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
+       .trim_fs        = ext4_trim_fs
  };
  
  static const struct super_operations ext4_nojournal_sops = {
@@@ -1216,6 -1236,7 +1233,7 @@@ enum 
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
+       Opt_init_inode_table, Opt_noinit_inode_table,
  };
  
  static const match_table_t tokens = {
        {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
+       {Opt_init_inode_table, "init_itable=%u"},
+       {Opt_init_inode_table, "init_itable"},
+       {Opt_noinit_inode_table, "noinit_itable"},
        {Opt_err, NULL},
  };
  
@@@ -1756,6 -1780,20 +1777,20 @@@ set_qf_format
                case Opt_dioread_lock:
                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
                        break;
+               case Opt_init_inode_table:
+                       set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                       if (args[0].from) {
+                               if (match_int(&args[0], &option))
+                                       return 0;
+                       } else
+                               option = EXT4_DEF_LI_WAIT_MULT;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_li_wait_mult = option;
+                       break;
+               case Opt_noinit_inode_table:
+                       clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                       break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@@ -1939,7 -1977,8 +1974,8 @@@ int ext4_group_desc_csum_verify(struct 
  }
  
  /* Called at mount-time, super-block is locked */
- static int ext4_check_descriptors(struct super_block *sb)
+ static int ext4_check_descriptors(struct super_block *sb,
+                                 ext4_group_t *first_not_zeroed)
  {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
-       ext4_group_t i;
+       ext4_group_t i, grp = sbi->s_groups_count;
  
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                flexbg_flag = 1;
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);
  
+               if ((grp == sbi->s_groups_count) &&
+                  !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       grp = i;
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
+       if (NULL != first_not_zeroed)
+               *first_not_zeroed = grp;
  
        ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
        sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@@ -2373,6 -2418,7 +2415,7 @@@ static struct ext4_attr ext4_attr_##_na
  #define EXT4_ATTR(name, mode, show, store) \
  static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
  
+ #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
  #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
  #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
  #define EXT4_RW_ATTR_SBI_UI(name, elname)     \
@@@ -2409,6 -2455,16 +2452,16 @@@ static struct attribute *ext4_attrs[] 
        NULL,
  };
  
+ /* Features this copy of ext4 supports */
+ EXT4_INFO_ATTR(lazy_itable_init);
+ EXT4_INFO_ATTR(batched_discard);
+ static struct attribute *ext4_feat_attrs[] = {
+       ATTR_LIST(lazy_itable_init),
+       ATTR_LIST(batched_discard),
+       NULL,
+ };
  static ssize_t ext4_attr_show(struct kobject *kobj,
                              struct attribute *attr, char *buf)
  {
@@@ -2437,7 -2493,6 +2490,6 @@@ static void ext4_sb_release(struct kobj
        complete(&sbi->s_kobj_unregister);
  }
  
  static const struct sysfs_ops ext4_attr_ops = {
        .show   = ext4_attr_show,
        .store  = ext4_attr_store,
@@@ -2449,6 -2504,17 +2501,17 @@@ static struct kobj_type ext4_ktype = 
        .release        = ext4_sb_release,
  };
  
+ static void ext4_feat_release(struct kobject *kobj)
+ {
+       complete(&ext4_feat->f_kobj_unregister);
+ }
+ static struct kobj_type ext4_feat_ktype = {
+       .default_attrs  = ext4_feat_attrs,
+       .sysfs_ops      = &ext4_attr_ops,
+       .release        = ext4_feat_release,
+ };
  /*
   * Check whether this filesystem can be mounted based on
   * the features present and the RDONLY/RDWR mount requested.
@@@ -2539,6 -2605,372 +2602,372 @@@ static void print_daily_error_info(unsi
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
  }
  
+ static void ext4_lazyinode_timeout(unsigned long data)
+ {
+       struct task_struct *p = (struct task_struct *)data;
+       wake_up_process(p);
+ }
+ /* Find next suitable group and run ext4_init_inode_table */
+ static int ext4_run_li_request(struct ext4_li_request *elr)
+ {
+       struct ext4_group_desc *gdp = NULL;
+       ext4_group_t group, ngroups;
+       struct super_block *sb;
+       unsigned long timeout = 0;
+       int ret = 0;
+       sb = elr->lr_super;
+       ngroups = EXT4_SB(sb)->s_groups_count;
+       for (group = elr->lr_next_group; group < ngroups; group++) {
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               if (!gdp) {
+                       ret = 1;
+                       break;
+               }
+               if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       break;
+       }
+       if (group == ngroups)
+               ret = 1;
+       if (!ret) {
+               timeout = jiffies;
+               ret = ext4_init_inode_table(sb, group,
+                                           elr->lr_timeout ? 0 : 1);
+               if (elr->lr_timeout == 0) {
+                       timeout = jiffies - timeout;
+                       if (elr->lr_sbi->s_li_wait_mult)
+                               timeout *= elr->lr_sbi->s_li_wait_mult;
+                       else
+                               timeout *= 20;
+                       elr->lr_timeout = timeout;
+               }
+               elr->lr_next_sched = jiffies + elr->lr_timeout;
+               elr->lr_next_group = group + 1;
+       }
+       return ret;
+ }
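A worked example of the adaptive back-off above, assuming HZ=1000 and a request whose lr_timeout has not yet been computed: if zeroing one group's inode table takes 50 jiffies and s_li_wait_mult is 10, the next run is scheduled 50 * 10 = 500 jiffies (about half a second) later, so the thread spends roughly one tenth of wall-clock time zeroing; with s_li_wait_mult unset, the hard-coded factor of 20 halves that duty cycle again.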
+ /*
+  * Remove lr_request from the request list and free the
+  * request structure. Should be called with li_list_mtx held.
+  */
+ static void ext4_remove_li_request(struct ext4_li_request *elr)
+ {
+       struct ext4_sb_info *sbi;
+       if (!elr)
+               return;
+       sbi = elr->lr_sbi;
+       list_del(&elr->lr_request);
+       sbi->s_li_request = NULL;
+       kfree(elr);
+ }
+ static void ext4_unregister_li_request(struct super_block *sb)
+ {
+       struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
+       if (!ext4_li_info)
+               return;
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       ext4_remove_li_request(elr);
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+ }
+ /*
+  * This is the function where the ext4lazyinit thread lives. It walks
+  * through the request list searching for the next scheduled filesystem.
+  * When such an fs is found, run the lazy initialization request
+  * (ext4_run_li_request) and keep track of the time spent in this
+  * function. Based on that time we compute the next schedule time of
+  * the request. When the walk through the list is complete, compute the
+  * next wakeup time and put the thread to sleep.
+  */
+ static int ext4_lazyinit_thread(void *arg)
+ {
+       struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+       struct list_head *pos, *n;
+       struct ext4_li_request *elr;
+       unsigned long next_wakeup;
+       DEFINE_WAIT(wait);
+       int ret;
+       BUG_ON(NULL == eli);
+       eli->li_timer.data = (unsigned long)current;
+       eli->li_timer.function = ext4_lazyinode_timeout;
+       eli->li_task = current;
+       wake_up(&eli->li_wait_task);
+ cont_thread:
+       while (true) {
+               next_wakeup = MAX_JIFFY_OFFSET;
+               mutex_lock(&eli->li_list_mtx);
+               if (list_empty(&eli->li_request_list)) {
+                       mutex_unlock(&eli->li_list_mtx);
+                       goto exit_thread;
+               }
+               list_for_each_safe(pos, n, &eli->li_request_list) {
+                       elr = list_entry(pos, struct ext4_li_request,
+                                        lr_request);
+                       if (time_after_eq(jiffies, elr->lr_next_sched)) {
+                               ret = ext4_run_li_request(elr);
+                               if (ret) {
+                                       ret = 0;
+                                       ext4_remove_li_request(elr);
+                                       continue;
+                               }
+                       }
+                       if (time_before(elr->lr_next_sched, next_wakeup))
+                               next_wakeup = elr->lr_next_sched;
+               }
+               mutex_unlock(&eli->li_list_mtx);
+               if (freezing(current))
+                       refrigerator();
+               if (time_after_eq(jiffies, next_wakeup)) {
+                       cond_resched();
+                       continue;
+               }
+               eli->li_timer.expires = next_wakeup;
+               add_timer(&eli->li_timer);
+               prepare_to_wait(&eli->li_wait_daemon, &wait,
+                               TASK_INTERRUPTIBLE);
+               if (time_before(jiffies, next_wakeup))
+                       schedule();
+               finish_wait(&eli->li_wait_daemon, &wait);
+       }
+ exit_thread:
+       /*
+        * It looks like the request list is empty, but we need
+        * to check it under the li_list_mtx lock, to prevent any
+        * additions into it, and of course we should lock ext4_li_mtx
+        * to atomically free the list and ext4_li_info, because at
+        * this point another ext4 filesystem could be registering
+        * a new one.
+        */
+       mutex_lock(&ext4_li_mtx);
+       mutex_lock(&eli->li_list_mtx);
+       if (!list_empty(&eli->li_request_list)) {
+               mutex_unlock(&eli->li_list_mtx);
+               mutex_unlock(&ext4_li_mtx);
+               goto cont_thread;
+       }
+       mutex_unlock(&eli->li_list_mtx);
+       del_timer_sync(&ext4_li_info->li_timer);
+       eli->li_task = NULL;
+       wake_up(&eli->li_wait_task);
+       kfree(ext4_li_info);
+       ext4_li_info = NULL;
+       mutex_unlock(&ext4_li_mtx);
+       return 0;
+ }
+ static void ext4_clear_request_list(void)
+ {
+       struct list_head *pos, *n;
+       struct ext4_li_request *elr;
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       if (list_empty(&ext4_li_info->li_request_list))
+               return;
+       list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+               elr = list_entry(pos, struct ext4_li_request,
+                                lr_request);
+               ext4_remove_li_request(elr);
+       }
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+ }
+ static int ext4_run_lazyinit_thread(void)
+ {
+       struct task_struct *t;
+       t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+       if (IS_ERR(t)) {
+               int err = PTR_ERR(t);
+               ext4_clear_request_list();
+               del_timer_sync(&ext4_li_info->li_timer);
+               kfree(ext4_li_info);
+               ext4_li_info = NULL;
+               printk(KERN_CRIT "EXT4: error %d creating inode table "
+                                "initialization thread\n",
+                                err);
+               return err;
+       }
+       ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+       wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
+       return 0;
+ }
+ /*
+  * Check whether it makes sense to run the itable init. thread or not.
+  * If there is at least one uninitialized inode table, return the
+  * corresponding group number, else the loop goes through all
+  * groups and returns the total number of groups.
+  */
+ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+ {
+       ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+       struct ext4_group_desc *gdp = NULL;
+       for (group = 0; group < ngroups; group++) {
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               if (!gdp)
+                       continue;
+               if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       break;
+       }
+       return group;
+ }
+ static int ext4_li_info_new(void)
+ {
+       struct ext4_lazy_init *eli = NULL;
+       eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+       if (!eli)
+               return -ENOMEM;
+       eli->li_task = NULL;
+       INIT_LIST_HEAD(&eli->li_request_list);
+       mutex_init(&eli->li_list_mtx);
+       init_waitqueue_head(&eli->li_wait_daemon);
+       init_waitqueue_head(&eli->li_wait_task);
+       init_timer(&eli->li_timer);
+       eli->li_state |= EXT4_LAZYINIT_QUIT;
+       ext4_li_info = eli;
+       return 0;
+ }
+ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+                                           ext4_group_t start)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_li_request *elr;
+       unsigned long rnd;
+       elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+       if (!elr)
+               return NULL;
+       elr->lr_super = sb;
+       elr->lr_sbi = sbi;
+       elr->lr_next_group = start;
+       /*
+        * Randomize the first schedule time of the request to
+        * spread the inode table initialization requests
+        * more evenly.
+        */
+       get_random_bytes(&rnd, sizeof(rnd));
+       elr->lr_next_sched = jiffies + (unsigned long)rnd %
+                            (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+       return elr;
+ }
+ static int ext4_register_li_request(struct super_block *sb,
+                                   ext4_group_t first_not_zeroed)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_li_request *elr;
+       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       int ret = 0;
+       if (sbi->s_li_request != NULL)
+               return 0;
+       if (first_not_zeroed == ngroups ||
+           (sb->s_flags & MS_RDONLY) ||
+           !test_opt(sb, INIT_INODE_TABLE)) {
+               sbi->s_li_request = NULL;
+               return 0;
+       }
+       elr = ext4_li_request_new(sb, first_not_zeroed);
+       if (!elr)
+               return -ENOMEM;
+       mutex_lock(&ext4_li_mtx);
+       if (NULL == ext4_li_info) {
+               ret = ext4_li_info_new();
+               if (ret)
+                       goto out;
+       }
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+       sbi->s_li_request = elr;
+       if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+               ret = ext4_run_lazyinit_thread();
+               if (ret)
+                       goto out;
+       }
+ out:
+       mutex_unlock(&ext4_li_mtx);
+       if (ret)
+               kfree(elr);
+       return ret;
+ }
+ /*
+  * We do not need to lock anything since this is called on
+  * module unload.
+  */
+ static void ext4_destroy_lazyinit_thread(void)
+ {
+       /*
+        * If the thread exited earlier,
+        * there's nothing to be done.
+        */
+       if (!ext4_li_info)
+               return;
+       ext4_clear_request_list();
+       while (ext4_li_info->li_task) {
+               wake_up(&ext4_li_info->li_wait_daemon);
+               wait_event(ext4_li_info->li_wait_task,
+                          ext4_li_info->li_task == NULL);
+       }
+ }
  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
        __u64 blocks_count;
        int err;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+       ext4_group_t first_not_zeroed;
  
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                sbi->s_sectors_written_start =
                        part_stat_read(sb->s_bdev->bd_part, sectors[1]);
  
 -      unlock_kernel();
 -
        /* Cleanup superblock name */
        for (cp = sb->s_id; (cp = strchr(cp, '/'));)
                *cp = '!';
  
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+       set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sbi->s_mount_opt, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
 -      if ((ext4_blocks_count(es) >
 -           (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
 -          (ext4_blocks_count(es) >
 -           (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
 +      ret = generic_check_addressable(sb->s_blocksize_bits,
 +                                      ext4_blocks_count(es));
 +      if (ret) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                if (sizeof(sector_t) < 8)
                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
 -              ret = -EFBIG;
                goto failed_mount;
        }
  
                        goto failed_mount2;
                }
        }
-       if (!ext4_check_descriptors(sb)) {
+       if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                goto failed_mount2;
        }
@@@ -3122,6 -3560,10 +3553,10 @@@ no_journal
                goto failed_mount4;
        }
  
+       err = ext4_register_li_request(sb, first_not_zeroed);
+       if (err)
+               goto failed_mount4;
        sbi->s_kobj.kset = ext4_kset;
        init_completion(&sbi->s_kobj_unregister);
        err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
  
 -      lock_kernel();
        kfree(orig_data);
        return 0;
  
@@@ -3205,6 -3648,7 +3640,6 @@@ out_fail
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
 -      lock_kernel();
  out_free_orig:
        kfree(orig_data);
        return ret;
@@@ -3461,7 -3905,7 +3896,7 @@@ static int ext4_load_journal(struct sup
        EXT4_SB(sb)->s_journal = journal;
        ext4_clear_journal_err(sb, es);
  
-       if (journal_devnum &&
+       if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
  
@@@ -3514,9 -3958,12 +3949,12 @@@ static int ext4_commit_super(struct sup
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-       ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+       if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+               ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeblocks_counter));
-       es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+       if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+               es->s_free_inodes_count =
+                       cpu_to_le32(percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeinodes_counter));
        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
@@@ -3713,6 -4160,8 +4151,6 @@@ static int ext4_remount(struct super_bl
  #endif
        char *orig_data = kstrdup(data, GFP_KERNEL);
  
 -      lock_kernel();
 -
        /* Store the original options */
        lock_super(sb);
        old_sb_flags = sb->s_flags;
                        enable_quota = 1;
                }
        }
+       /*
+        * Reinitialize lazy itable initialization thread based on
+        * current settings
+        */
+       if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+               ext4_unregister_li_request(sb);
+       else {
+               ext4_group_t first_not_zeroed;
+               first_not_zeroed = ext4_has_uninit_itable(sb);
+               ext4_register_li_request(sb, first_not_zeroed);
+       }
        ext4_setup_system_zone(sb);
        if (sbi->s_journal == NULL)
                ext4_commit_super(sb, 1);
                        kfree(old_opts.s_qf_names[i]);
  #endif
        unlock_super(sb);
 -      unlock_kernel();
        if (enable_quota)
                dquot_resume(sb, -1);
  
@@@ -3872,6 -4335,7 +4323,6 @@@ restore_opts
        }
  #endif
        unlock_super(sb);
 -      unlock_kernel();
        kfree(orig_data);
        return err;
  }
@@@ -4276,23 -4740,53 +4727,53 @@@ static struct file_system_type ext4_fs_
        .fs_flags       = FS_REQUIRES_DEV,
  };
  
- static int __init init_ext4_fs(void)
+ int __init ext4_init_feat_adverts(void)
+ {
+       struct ext4_features *ef;
+       int ret = -ENOMEM;
+       ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+       if (!ef)
+               goto out;
+       ef->f_kobj.kset = ext4_kset;
+       init_completion(&ef->f_kobj_unregister);
+       ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+                                  "features");
+       if (ret) {
+               kfree(ef);
+               goto out;
+       }
+       ext4_feat = ef;
+       ret = 0;
+ out:
+       return ret;
+ }
+ static int __init ext4_init_fs(void)
  {
        int err;
  
        ext4_check_flag_values();
-       err = init_ext4_system_zone();
+       err = ext4_init_pageio();
        if (err)
                return err;
+       err = ext4_init_system_zone();
+       if (err)
+               goto out5;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
                goto out4;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-       err = init_ext4_mballoc();
+       err = ext4_init_feat_adverts();
+       if (err)
+               goto out3;
+       err = ext4_init_mballoc();
        if (err)
                goto out3;
  
-       err = init_ext4_xattr();
+       err = ext4_init_xattr();
        if (err)
                goto out2;
        err = init_inodecache();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
+       ext4_li_info = NULL;
+       mutex_init(&ext4_li_mtx);
        return 0;
  out:
        unregister_as_ext2();
        unregister_as_ext3();
        destroy_inodecache();
  out1:
-       exit_ext4_xattr();
+       ext4_exit_xattr();
  out2:
-       exit_ext4_mballoc();
+       ext4_exit_mballoc();
  out3:
+       kfree(ext4_feat);
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
  out4:
-       exit_ext4_system_zone();
+       ext4_exit_system_zone();
+ out5:
+       ext4_exit_pageio();
        return err;
  }
  
- static void __exit exit_ext4_fs(void)
+ static void __exit ext4_exit_fs(void)
  {
+       ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
-       exit_ext4_xattr();
-       exit_ext4_mballoc();
+       ext4_exit_xattr();
+       ext4_exit_mballoc();
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
-       exit_ext4_system_zone();
+       ext4_exit_system_zone();
+       ext4_exit_pageio();
  }
  
  MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
  MODULE_DESCRIPTION("Fourth Extended Filesystem");
  MODULE_LICENSE("GPL");
- module_init(init_ext4_fs)
- module_exit(exit_ext4_fs)
+ module_init(ext4_init_fs)
+ module_exit(ext4_exit_fs)
diff --combined fs/jbd2/checkpoint.c
@@@ -299,6 -299,16 +299,16 @@@ static int __process_buffer(journal_t *
                transaction->t_chp_stats.cs_forced_to_close++;
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
+               if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+                       /*
+                        * The journal thread is dead; so starting and
+                        * waiting for a commit to finish will cause
+                        * us to wait for a _very_ long time.
+                        */
+                       printk(KERN_ERR "JBD2: %s: "
+                              "Waiting for Godot: block %llu\n",
+                              journal->j_devname,
+                              (unsigned long long) bh->b_blocknr);
                jbd2_log_start_commit(journal, tid);
                jbd2_log_wait_commit(journal, tid);
                ret = 1;
@@@ -532,7 -542,8 +542,7 @@@ int jbd2_cleanup_journal_tail(journal_
         */
        if ((journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
 -              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
 -                      BLKDEV_IFL_WAIT);
 +              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
        if (!(journal->j_flags & JBD2_ABORT))
                jbd2_journal_update_superblock(journal, 1);
        return 0;
diff --combined fs/jbd2/commit.c
@@@ -26,7 -26,9 +26,9 @@@
  #include <linux/backing-dev.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
+ #include <linux/bitops.h>
  #include <trace/events/jbd2.h>
+ #include <asm/system.h>
  
  /*
   * Default IO end handler for temporary BJ_IO buffer_heads.
@@@ -134,11 -136,25 +136,11 @@@ static int journal_submit_commit_record
  
        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
 -                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 -              ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
 -              if (ret == -EOPNOTSUPP) {
 -                      printk(KERN_WARNING
 -                             "JBD2: Disabling barriers on %s, "
 -                             "not supported by device\n", journal->j_devname);
 -                      write_lock(&journal->j_state_lock);
 -                      journal->j_flags &= ~JBD2_BARRIER;
 -                      write_unlock(&journal->j_state_lock);
 -
 -                      /* And try again, without the barrier */
 -                      lock_buffer(bh);
 -                      set_buffer_uptodate(bh);
 -                      clear_buffer_dirty(bh);
 -                      ret = submit_bh(WRITE_SYNC_PLUG, bh);
 -              }
 -      } else {
 +                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 +              ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
 +      else
                ret = submit_bh(WRITE_SYNC_PLUG, bh);
 -      }
 +
        *cbh = bh;
        return ret;
  }
@@@ -152,8 -168,29 +154,8 @@@ static int journal_wait_on_commit_recor
  {
        int ret = 0;
  
 -retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
 -      if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
 -              printk(KERN_WARNING
 -                     "JBD2: %s: disabling barries on %s - not supported "
 -                     "by device\n", __func__, journal->j_devname);
 -              write_lock(&journal->j_state_lock);
 -              journal->j_flags &= ~JBD2_BARRIER;
 -              write_unlock(&journal->j_state_lock);
 -
 -              lock_buffer(bh);
 -              clear_buffer_dirty(bh);
 -              set_buffer_uptodate(bh);
 -              bh->b_end_io = journal_end_buffer_io_sync;
 -
 -              ret = submit_bh(WRITE_SYNC_PLUG, bh);
 -              if (ret) {
 -                      unlock_buffer(bh);
 -                      return ret;
 -              }
 -              goto retry;
 -      }
  
        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
@@@ -201,7 -238,7 +203,7 @@@ static int journal_submit_data_buffers(
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
-               jinode->i_flags |= JI_COMMIT_RUNNING;
+               set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                /*
                 * submit the inode data buffers. We use writepage
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                commit_transaction->t_flushed_data_blocks = 1;
-               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+               smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
@@@ -237,7 -275,7 +240,7 @@@ static int journal_finish_inode_data_bu
        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-               jinode->i_flags |= JI_COMMIT_RUNNING;
+               set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                if (err) {
                                ret = err;
                }
                spin_lock(&journal->j_list_lock);
-               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+               smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
  
@@@ -325,7 -364,7 +329,7 @@@ void jbd2_journal_commit_transaction(jo
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
 -      int write_op = WRITE;
 +      int write_op = WRITE_SYNC;
  
        /*
         * First job: lock down the current transaction and wait for
@@@ -666,16 -705,6 +670,16 @@@ start_journal_io
                }
        }
  
 +      err = journal_finish_inode_data_buffers(journal, commit_transaction);
 +      if (err) {
 +              printk(KERN_WARNING
 +                      "JBD2: Detected IO errors while flushing file data "
 +                     "on %s\n", journal->j_devname);
 +              if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 +                      jbd2_journal_abort(journal, err);
 +              err = 0;
 +      }
 +
        /* 
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
        if (commit_transaction->t_flushed_data_blocks &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
 -              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
 -                      BLKDEV_IFL_WAIT);
 +              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
  
        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
 -              if (journal->j_flags & JBD2_BARRIER)
 -                      blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
 -                              BLKDEV_IFL_WAIT);
 -      }
 -
 -      err = journal_finish_inode_data_buffers(journal, commit_transaction);
 -      if (err) {
 -              printk(KERN_WARNING
 -                      "JBD2: Detected IO errors while flushing file data "
 -                     "on %s\n", journal->j_devname);
 -              if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 -                      jbd2_journal_abort(journal, err);
 -              err = 0;
        }
  
        /* Lo and behold: we have just managed to send a transaction to
@@@ -806,11 -849,6 +810,11 @@@ wait_for_iobuf
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(journal, cbh);
 +      if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 +                                    JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 +          journal->j_flags & JBD2_BARRIER) {
 +              blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 +      }
  
        if (err)
                jbd2_journal_abort(journal, err);
diff --combined fs/jbd2/journal.c
  #include <linux/log2.h>
  #include <linux/vmalloc.h>
  #include <linux/backing-dev.h>
+ #include <linux/bitops.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/jbd2.h>
  
  #include <asm/uaccess.h>
  #include <asm/page.h>
+ #include <asm/system.h>
  
  EXPORT_SYMBOL(jbd2_journal_extend);
  EXPORT_SYMBOL(jbd2_journal_stop);
@@@ -1371,10 -1373,6 +1373,10 @@@ int jbd2_journal_check_used_features (j
  
        if (!compat && !ro && !incompat)
                return 1;
 +      /* Load journal superblock if it is not loaded yet. */
 +      if (journal->j_format_version == 0 &&
 +          journal_get_superblock(journal) != 0)
 +              return 0;
        if (journal->j_format_version == 1)
                return 0;
  
@@@ -2210,7 -2208,7 +2212,7 @@@ void jbd2_journal_release_jbd_inode(jou
  restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
-       if (jinode->i_flags & JI_COMMIT_RUNNING) {
+       if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --combined include/linux/blkdev.h
@@@ -124,9 -124,6 +124,9 @@@ struct request 
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;
 +#if defined(CONFIG_BLK_DEV_INTEGRITY)
 +      unsigned short nr_integrity_segments;
 +#endif
  
        unsigned short ioprio;
  
@@@ -246,7 -243,6 +246,7 @@@ struct queue_limits 
  
        unsigned short          logical_block_size;
        unsigned short          max_segments;
 +      unsigned short          max_integrity_segments;
  
        unsigned char           misaligned;
        unsigned char           discard_misaligned;
@@@ -359,25 -355,18 +359,25 @@@ struct request_queu
        struct blk_trace        *blk_trace;
  #endif
        /*
 -       * reserved for flush operations
 +       * for flush operations
         */
 -      unsigned int            ordered, next_ordered, ordseq;
 -      int                     orderr, ordcolor;
 -      struct request          pre_flush_rq, bar_rq, post_flush_rq;
 -      struct request          *orig_bar_rq;
 +      unsigned int            flush_flags;
 +      unsigned int            flush_seq;
 +      int                     flush_err;
 +      struct request          flush_rq;
 +      struct request          *orig_flush_rq;
 +      struct list_head        pending_flushes;
  
        struct mutex            sysfs_lock;
  
  #if defined(CONFIG_BLK_DEV_BSG)
        struct bsg_class_device bsg_dev;
  #endif
 +
 +#ifdef CONFIG_BLK_DEV_THROTTLING
 +      /* Throttle data */
 +      struct throtl_data *td;
 +#endif
  };
  
  #define QUEUE_FLAG_CLUSTER    0       /* cluster several segments into 1 */
@@@ -473,6 -462,56 +473,6 @@@ static inline void queue_flag_clear(uns
        __clear_bit(flag, &q->queue_flags);
  }
  
 -enum {
 -      /*
 -       * Hardbarrier is supported with one of the following methods.
 -       *
 -       * NONE         : hardbarrier unsupported
 -       * DRAIN        : ordering by draining is enough
 -       * DRAIN_FLUSH  : ordering by draining w/ pre and post flushes
 -       * DRAIN_FUA    : ordering by draining w/ pre flush and FUA write
 -       * TAG          : ordering by tag is enough
 -       * TAG_FLUSH    : ordering by tag w/ pre and post flushes
 -       * TAG_FUA      : ordering by tag w/ pre flush and FUA write
 -       */
 -      QUEUE_ORDERED_BY_DRAIN          = 0x01,
 -      QUEUE_ORDERED_BY_TAG            = 0x02,
 -      QUEUE_ORDERED_DO_PREFLUSH       = 0x10,
 -      QUEUE_ORDERED_DO_BAR            = 0x20,
 -      QUEUE_ORDERED_DO_POSTFLUSH      = 0x40,
 -      QUEUE_ORDERED_DO_FUA            = 0x80,
 -
 -      QUEUE_ORDERED_NONE              = 0x00,
 -
 -      QUEUE_ORDERED_DRAIN             = QUEUE_ORDERED_BY_DRAIN |
 -                                        QUEUE_ORDERED_DO_BAR,
 -      QUEUE_ORDERED_DRAIN_FLUSH       = QUEUE_ORDERED_DRAIN |
 -                                        QUEUE_ORDERED_DO_PREFLUSH |
 -                                        QUEUE_ORDERED_DO_POSTFLUSH,
 -      QUEUE_ORDERED_DRAIN_FUA         = QUEUE_ORDERED_DRAIN |
 -                                        QUEUE_ORDERED_DO_PREFLUSH |
 -                                        QUEUE_ORDERED_DO_FUA,
 -
 -      QUEUE_ORDERED_TAG               = QUEUE_ORDERED_BY_TAG |
 -                                        QUEUE_ORDERED_DO_BAR,
 -      QUEUE_ORDERED_TAG_FLUSH         = QUEUE_ORDERED_TAG |
 -                                        QUEUE_ORDERED_DO_PREFLUSH |
 -                                        QUEUE_ORDERED_DO_POSTFLUSH,
 -      QUEUE_ORDERED_TAG_FUA           = QUEUE_ORDERED_TAG |
 -                                        QUEUE_ORDERED_DO_PREFLUSH |
 -                                        QUEUE_ORDERED_DO_FUA,
 -
 -      /*
 -       * Ordered operation sequence
 -       */
 -      QUEUE_ORDSEQ_STARTED    = 0x01, /* flushing in progress */
 -      QUEUE_ORDSEQ_DRAIN      = 0x02, /* waiting for the queue to be drained */
 -      QUEUE_ORDSEQ_PREFLUSH   = 0x04, /* pre-flushing in progress */
 -      QUEUE_ORDSEQ_BAR        = 0x08, /* original barrier req in progress */
 -      QUEUE_ORDSEQ_POSTFLUSH  = 0x10, /* post-flushing in progress */
 -      QUEUE_ORDSEQ_DONE       = 0x20,
 -};
 -
  #define blk_queue_plugged(q)  test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
  #define blk_queue_tagged(q)   test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
  #define blk_queue_stopped(q)  test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
  #define blk_queue_nonrot(q)   test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
  #define blk_queue_io_stat(q)  test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
  #define blk_queue_add_random(q)       test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 -#define blk_queue_flushing(q) ((q)->ordseq)
  #define blk_queue_stackable(q)        \
        test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
  #define blk_queue_discard(q)  test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@@ -552,8 -592,7 +552,8 @@@ static inline void blk_clear_queue_full
   * it already be started by driver.
   */
  #define RQ_NOMERGE_FLAGS      \
 -      (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
 +      (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
 +       REQ_FLUSH | REQ_FUA)
  #define rq_mergeable(rq)      \
        (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
         (((rq)->cmd_flags & REQ_DISCARD) || \
@@@ -812,7 -851,7 +812,7 @@@ extern void blk_queue_max_segment_size(
  extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
  extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
 +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
  extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
  extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@@ -842,8 -881,12 +842,8 @@@ extern void blk_queue_update_dma_alignm
  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
  extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 +extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 -extern int blk_queue_ordered(struct request_queue *, unsigned);
 -extern bool blk_do_ordered(struct request_queue *, struct request **);
 -extern unsigned blk_ordered_cur_seq(struct request_queue *);
 -extern unsigned blk_ordered_req_seq(struct request *);
 -extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
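The multi-step ordered-mode machinery above is replaced by a single capability declaration. As a hedged sketch (not taken from this patch; q stands for a driver's request queue), a driver whose device has a volatile write cache and supports forced unit access would now declare:

	/*
	 * Sketch: declare the cache characteristics once at init time.
	 * REQ_FLUSH = device needs explicit cache flushes; REQ_FUA =
	 * device can commit a single write to stable media on request.
	 * The block layer sequences flush/FUA requests from this alone.
	 */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);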
  
  extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
  extern void blk_dump_rq_flags(struct request *, char *);
@@@ -876,21 -919,36 +876,29 @@@ static inline struct request *blk_map_q
                return NULL;
        return bqt->tag_index[tag];
  }
 -enum{
 -      BLKDEV_WAIT,    /* wait for completion */
 -      BLKDEV_BARRIER, /* issue request with barrier */
 -      BLKDEV_SECURE,  /* secure discard */
 -};
 -#define BLKDEV_IFL_WAIT               (1 << BLKDEV_WAIT)
 -#define BLKDEV_IFL_BARRIER    (1 << BLKDEV_BARRIER)
 -#define BLKDEV_IFL_SECURE     (1 << BLKDEV_SECURE)
 -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
 -                      unsigned long);
 +
 +#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
 +
 +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
  extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
  extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 -                      sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 -static inline int sb_issue_discard(struct super_block *sb,
 -                                 sector_t block, sector_t nr_blocks)
 +                      sector_t nr_sects, gfp_t gfp_mask);
 +static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 +              sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
  {
 -      block <<= (sb->s_blocksize_bits - 9);
 -      nr_blocks <<= (sb->s_blocksize_bits - 9);
 -      return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
 -                                 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 +      return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
 +                                  nr_blocks << (sb->s_blocksize_bits - 9),
 +                                  gfp_mask, flags);
  }
 -              sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
+ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 -                                  gfp_mask, flags);
++              sector_t nr_blocks, gfp_t gfp_mask)
+ {
+       return blkdev_issue_zeroout(sb->s_bdev,
+                                   block << (sb->s_blocksize_bits - 9),
+                                   nr_blocks << (sb->s_blocksize_bits - 9),
++                                  gfp_mask);
+ }
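Both helpers translate filesystem blocks into 512-byte sectors via the << (s_blocksize_bits - 9) shift: with 4KiB blocks (s_blocksize_bits == 12), one block spans 8 sectors. A hedged caller sketch, assuming sb and block from an ext4-style free path:

	int err;

	/*
	 * Sketch: discard one freed block.  GFP_NOFS keeps reclaim from
	 * re-entering the filesystem; BLKDEV_DISCARD_SECURE would be
	 * passed in flags only if the data must not survive on media.
	 */
	err = sb_issue_discard(sb, block, 1, GFP_NOFS, 0);
	if (err && err != -EOPNOTSUPP)
		printk(KERN_WARNING "discard of block %llu failed: %d\n",
		       (unsigned long long) block, err);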
  
  extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
  
@@@ -954,7 -1012,7 +962,7 @@@ static inline unsigned int queue_physic
        return q->limits.physical_block_size;
  }
  
 -static inline int bdev_physical_block_size(struct block_device *bdev)
 +static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
  {
        return queue_physical_block_size(bdev_get_queue(bdev));
  }
@@@ -1043,11 -1101,11 +1051,11 @@@ static inline int queue_dma_alignment(s
        return q ? q->dma_alignment : 511;
  }
  
 -static inline int blk_rq_aligned(struct request_queue *q, void *addr,
 +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                 unsigned int len)
  {
        unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 -      return !((unsigned long)addr & alignment) && !(len & alignment);
 +      return !(addr & alignment) && !(len & alignment);
  }
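A hedged sketch of a caller; ubuf, len, and both helper functions are illustrative names, not part of this patch:

	/*
	 * Sketch: a buffer that violates the queue's DMA alignment or
	 * pad mask cannot be mapped directly and needs a bounce copy.
	 */
	if (!blk_rq_aligned(q, (unsigned long) ubuf, len))
		return copy_via_bounce_buffer(q, ubuf, len);	/* hypothetical */
	return map_buffer_directly(q, ubuf, len);		/* hypothetical */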
  
  /* assumes size > 256 */
@@@ -1077,7 -1135,6 +1085,7 @@@ static inline void put_dev_sector(Secto
  
  struct work_struct;
  int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
  
  #ifdef CONFIG_BLK_CGROUP
  /*
@@@ -1121,24 -1178,6 +1129,24 @@@ static inline uint64_t rq_io_start_time
  }
  #endif
  
 +#ifdef CONFIG_BLK_DEV_THROTTLING
 +extern int blk_throtl_init(struct request_queue *q);
 +extern void blk_throtl_exit(struct request_queue *q);
 +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
 +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
 +extern void throtl_shutdown_timer_wq(struct request_queue *q);
 +#else /* CONFIG_BLK_DEV_THROTTLING */
 +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
 +{
 +      return 0;
 +}
 +
 +static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 +static inline void blk_throtl_exit(struct request_queue *q) { }
 +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
 +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
 +#endif /* CONFIG_BLK_DEV_THROTTLING */
 +
  #define MODULE_ALIAS_BLOCKDEV(major,minor) \
        MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
  #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@@ -1182,13 -1221,8 +1190,13 @@@ struct blk_integrity 
  extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
  extern void blk_integrity_unregister(struct gendisk *);
  extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
 -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 -extern int blk_rq_count_integrity_sg(struct request *);
 +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
 +                                 struct scatterlist *);
 +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 +extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
 +                                struct request *);
 +extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
 +                                 struct bio *);
  
  static inline
  struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@@ -1209,32 -1243,16 +1217,32 @@@ static inline int blk_integrity_rq(stru
        return bio_integrity(rq->bio);
  }
  
 +static inline void blk_queue_max_integrity_segments(struct request_queue *q,
 +                                                  unsigned int segs)
 +{
 +      q->limits.max_integrity_segments = segs;
 +}
 +
 +static inline unsigned short
 +queue_max_integrity_segments(struct request_queue *q)
 +{
 +      return q->limits.max_integrity_segments;
 +}
 +
  #else /* CONFIG_BLK_DEV_INTEGRITY */
  
  #define blk_integrity_rq(rq)                  (0)
 -#define blk_rq_count_integrity_sg(a)          (0)
 -#define blk_rq_map_integrity_sg(a, b)         (0)
 +#define blk_rq_count_integrity_sg(a, b)               (0)
 +#define blk_rq_map_integrity_sg(a, b, c)      (0)
  #define bdev_get_integrity(a)                 (0)
  #define blk_get_integrity(a)                  (0)
  #define blk_integrity_compare(a, b)           (0)
  #define blk_integrity_register(a, b)          (0)
  #define blk_integrity_unregister(a)           do { } while (0);
 +#define blk_queue_max_integrity_segments(a, b)        do { } while (0);
 +#define queue_max_integrity_segments(a)               (0)
 +#define blk_integrity_merge_rq(a, b, c)               (0)
 +#define blk_integrity_merge_bio(a, b, c)      (0)
  
  #endif /* CONFIG_BLK_DEV_INTEGRITY */
  
diff --combined include/linux/fs.h
  #define SEEK_END      2       /* seek relative to end of file */
  #define SEEK_MAX      SEEK_END
  
+ struct fstrim_range {
+       uint64_t start;
+       uint64_t len;
+       uint64_t minlen;
+ };
  /* And dynamically-tunable limits and defaults: */
  struct files_stat_struct {
 -      int nr_files;           /* read only */
 -      int nr_free_files;      /* read only */
 -      int max_files;          /* tunable */
 +      unsigned long nr_files;         /* read only */
 +      unsigned long nr_free_files;    /* read only */
 +      unsigned long max_files;                /* tunable */
  };
  
  struct inodes_stat_t {
@@@ -92,9 -98,6 +98,9 @@@
  /* Expect random access pattern */
  #define FMODE_RANDOM          ((__force fmode_t)0x1000)
  
 +/* File is huge (e.g. /dev/kmem): treat loff_t as unsigned */
 +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
 +
  /* File was opened by fanotify and shouldn't generate fanotify events */
  #define FMODE_NONOTIFY                ((__force fmode_t)0x1000000)
  
   *                    immediately after submission. The write equivalent
   *                    of READ_SYNC.
   * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
 - * WRITE_BARRIER      Like WRITE_SYNC, but tells the block layer that all
 - *                    previously submitted writes must be safely on storage
 - *                    before this one is started. Also guarantees that when
 - *                    this write is complete, it itself is also safely on
 - *                    storage. Prevents reordering of writes on both sides
 - *                    of this IO.
 + * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
 + * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
 + *                    non-volatile media on completion.
 + * WRITE_FLUSH_FUA    Combination of WRITE_FLUSH and FUA. The IO is preceded
 + *                    by a cache flush and data is guaranteed to be on
 + *                    non-volatile media on completion.
   *
   */
  #define RW_MASK                       REQ_WRITE
  #define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
  #define WRITE_ODIRECT_PLUG    (WRITE | REQ_SYNC)
  #define WRITE_META            (WRITE | REQ_META)
 -#define WRITE_BARRIER         (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_HARDBARRIER)
 -
 -/*
 - * These aren't really reads or writes, they pass down information about
 - * parts of device that are now unused by the file system.
 - */
 -#define DISCARD_NOBARRIER     (WRITE | REQ_DISCARD)
 -#define DISCARD_BARRIER               (WRITE | REQ_DISCARD | REQ_HARDBARRIER)
 -#define DISCARD_SECURE                (DISCARD_NOBARRIER | REQ_SECURE)
 +#define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 +                               REQ_FLUSH)
 +#define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 +                               REQ_FUA)
 +#define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 +                               REQ_FLUSH | REQ_FUA)
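A sketch of typical use, modeled loosely on a journal commit path; the buffer_head bh is assumed to already carry the commit record:

	/*
	 * Sketch: REQ_FLUSH makes earlier journal writes stable before
	 * the commit record is issued; REQ_FUA makes the record itself
	 * durable on completion, with no global queue barrier involved.
	 */
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = end_buffer_write_sync;
	submit_bh(WRITE_FLUSH_FUA, bh);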
  
  #define SEL_IN                1
  #define SEL_OUT               2
  #define S_NOCMTIME    128     /* Do not update file c/mtime */
  #define S_SWAPFILE    256     /* Do not truncate: swapon got its bmaps */
  #define S_PRIVATE     512     /* Inode is fs-internal */
 +#define S_IMA         1024    /* Inode has an associated IMA struct */
  
  /*
   * Note that nosuid etc flags are inode-specific: setting some file-system
  #define IS_NOCMTIME(inode)    ((inode)->i_flags & S_NOCMTIME)
  #define IS_SWAPFILE(inode)    ((inode)->i_flags & S_SWAPFILE)
  #define IS_PRIVATE(inode)     ((inode)->i_flags & S_PRIVATE)
 +#define IS_IMA(inode)         ((inode)->i_flags & S_IMA)
  
  /* the read-only stuff doesn't really belong here, but any other place is
     probably as bad and I don't want to create yet another include file. */
  #define FIGETBSZ   _IO(0x00,2)        /* get the block size used for bmap */
  #define FIFREEZE      _IOWR('X', 119, int)    /* Freeze */
  #define FITHAW                _IOWR('X', 120, int)    /* Thaw */
+ #define FITRIM                _IOWR('X', 121, struct fstrim_range)    /* Trim */
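FITRIM pairs with struct fstrim_range and the new ->trim_fs super operation above. A minimal user-space sketch; the mount point is an assumption, and a filesystem without a trim_fs hook is expected to fail with EOPNOTSUPP:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(void)
	{
		/* Trim all free space; minlen filters out extents too
		 * small to be worth discarding. */
		struct fstrim_range range = {
			.start  = 0,
			.len    = UINT64_MAX,
			.minlen = 0,
		};
		int fd = open("/mnt", O_RDONLY);  /* assumed mount point */

		if (fd < 0 || ioctl(fd, FITRIM, &range) != 0)
			perror("FITRIM");
		return 0;
	}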
  
  #define       FS_IOC_GETFLAGS                 _IOR('f', 1, long)
  #define       FS_IOC_SETFLAGS                 _IOW('f', 2, long)
@@@ -405,7 -411,7 +412,7 @@@ extern void __init inode_init_early(voi
  extern void __init files_init(unsigned long);
  
  extern struct files_stat_struct files_stat;
 -extern int get_max_files(void);
 +extern unsigned long get_max_files(void);
  extern int sysctl_nr_open;
  extern struct inodes_stat_t inodes_stat;
  extern int leases_enable, lease_break_time;
@@@ -725,8 -731,7 +732,8 @@@ struct posix_acl
  
  struct inode {
        struct hlist_node       i_hash;
 -      struct list_head        i_list;         /* backing dev IO list */
 +      struct list_head        i_wb_list;      /* backing dev IO list */
 +      struct list_head        i_lru;          /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_dentry;
        unsigned long           i_ino;
  
        unsigned int            i_flags;
  
 +#ifdef CONFIG_IMA
 +      /* protected by i_lock */
 +      unsigned int            i_readcount; /* struct files open RO */
 +#endif
        atomic_t                i_writecount;
  #ifdef CONFIG_SECURITY
        void                    *i_security;
        void                    *i_private; /* fs or device private pointer */
  };
  
 +static inline int inode_unhashed(struct inode *inode)
 +{
 +      return hlist_unhashed(&inode->i_hash);
 +}
 +
  /*
   * inode->i_mutex nesting subclasses for the lock validator:
   *
@@@ -1104,6 -1100,10 +1111,6 @@@ struct file_lock 
  
  #include <linux/fcntl.h>
  
 -/* temporary stubs for BKL removal */
 -#define lock_flocks() lock_kernel()
 -#define unlock_flocks() unlock_kernel()
 -
  extern void send_sigio(struct fown_struct *fown, int fd, int band);
  
  #ifdef CONFIG_FILE_LOCKING
@@@ -1122,7 -1122,6 +1129,7 @@@ extern int fcntl_getlease(struct file *
  
  /* fs/locks.c */
  extern void locks_init_lock(struct file_lock *);
 +extern struct file_lock *locks_alloc_lock(void);
  extern void locks_copy_lock(struct file_lock *, struct file_lock *);
  extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
  extern void locks_remove_posix(struct file *, fl_owner_t);
@@@ -1143,8 -1142,6 +1150,8 @@@ extern int vfs_setlease(struct file *, 
  extern int lease_modify(struct file_lock **, int);
  extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
  extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
 +extern void lock_flocks(void);
 +extern void unlock_flocks(void);
  #else /* !CONFIG_FILE_LOCKING */
  static inline int fcntl_getlk(struct file *file, struct flock __user *user)
  {
@@@ -1287,14 -1284,6 +1294,14 @@@ static inline int lock_may_write(struc
        return 1;
  }
  
 +static inline void lock_flocks(void)
 +{
 +}
 +
 +static inline void unlock_flocks(void)
 +{
 +}
 +
  #endif /* !CONFIG_FILE_LOCKING */
  
  
@@@ -1311,11 -1300,6 +1318,11 @@@ struct fasync_struct 
  
  /* SMP safe fasync helpers: */
  extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
 +extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
 +extern int fasync_remove_entry(struct file *, struct fasync_struct **);
 +extern struct fasync_struct *fasync_alloc(void);
 +extern void fasync_free(struct fasync_struct *);
 +
  /* can be called from interrupts */
  extern void kill_fasync(struct fasync_struct **, int, int);
  
@@@ -1407,7 -1391,7 +1414,7 @@@ struct super_block 
         * Saved mount options for lazy filesystems using
         * generic_show_options()
         */
 -      char *s_options;
 +      char __rcu *s_options;
  };
  
  extern struct timespec current_fs_time(struct super_block *sb);
@@@ -1604,6 -1588,7 +1611,7 @@@ struct super_operations 
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
  #endif
        int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+       int (*trim_fs) (struct super_block *, struct fstrim_range *);
  };
  
  /*
   *
   * Q: What is the difference between I_WILL_FREE and I_FREEING?
   */
 -#define I_DIRTY_SYNC          1
 -#define I_DIRTY_DATASYNC      2
 -#define I_DIRTY_PAGES         4
 +#define I_DIRTY_SYNC          (1 << 0)
 +#define I_DIRTY_DATASYNC      (1 << 1)
 +#define I_DIRTY_PAGES         (1 << 2)
  #define __I_NEW                       3
  #define I_NEW                 (1 << __I_NEW)
 -#define I_WILL_FREE           16
 -#define I_FREEING             32
 -#define I_CLEAR                       64
 +#define I_WILL_FREE           (1 << 4)
 +#define I_FREEING             (1 << 5)
 +#define I_CLEAR                       (1 << 6)
  #define __I_SYNC              7
  #define I_SYNC                        (1 << __I_SYNC)
 +#define I_REFERENCED          (1 << 8)
  
  #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
  
@@@ -1756,7 -1740,6 +1764,7 @@@ static inline void file_accessed(struc
  }
  
  int sync_inode(struct inode *inode, struct writeback_control *wbc);
 +int sync_inode_metadata(struct inode *inode, int wait);
  
  struct file_system_type {
        const char *name;
@@@ -2101,6 -2084,7 +2109,6 @@@ extern int check_disk_change(struct blo
  extern int __invalidate_device(struct block_device *);
  extern int invalidate_partition(struct gendisk *, int);
  #endif
 -extern int invalidate_inodes(struct super_block *);
  unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);
  
@@@ -2184,7 -2168,7 +2192,7 @@@ extern loff_t vfs_llseek(struct file *f
  
  extern int inode_init_always(struct super_block *, struct inode *);
  extern void inode_init_once(struct inode *);
 -extern void inode_add_to_lists(struct super_block *, struct inode *);
 +extern void ihold(struct inode * inode);
  extern void iput(struct inode *);
  extern struct inode * igrab(struct inode *);
  extern ino_t iunique(struct super_block *, ino_t);
@@@ -2204,11 -2188,11 +2212,11 @@@ extern struct inode * iget_locked(struc
  extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
  extern int insert_inode_locked(struct inode *);
  extern void unlock_new_inode(struct inode *);
 +extern unsigned int get_next_ino(void);
  
  extern void __iget(struct inode * inode);
  extern void iget_failed(struct inode *);
  extern void end_writeback(struct inode *);
 -extern void destroy_inode(struct inode *);
  extern void __destroy_inode(struct inode *);
  extern struct inode *new_inode(struct super_block *);
  extern int should_remove_suid(struct dentry *);
@@@ -2216,11 -2200,9 +2224,11 @@@ extern int file_remove_suid(struct fil
  
  extern void __insert_inode_hash(struct inode *, unsigned long hashval);
  extern void remove_inode_hash(struct inode *);
 -static inline void insert_inode_hash(struct inode *inode) {
 +static inline void insert_inode_hash(struct inode *inode)
 +{
        __insert_inode_hash(inode, inode->i_ino);
  }
 +extern void inode_sb_list_add(struct inode *inode);
  
  #ifdef CONFIG_BLOCK
  extern void submit_bio(int, struct bio *);
@@@ -2404,8 -2386,6 +2412,8 @@@ extern ssize_t simple_write_to_buffer(v
  
  extern int generic_file_fsync(struct file *, int);
  
 +extern int generic_check_addressable(unsigned, u64);
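A hedged mount-time sketch; blocks_count and the error label are assumed names in an ext4-style fill_super:

	/*
	 * Sketch: refuse the mount when the last block cannot be reached
	 * through the page cache on this architecture (the helper returns
	 * -EFBIG when the device is too large for the block size).
	 */
	err = generic_check_addressable(sb->s_blocksize_bits, blocks_count);
	if (err)
		goto failed_mount;	/* hypothetical label */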
 +
  #ifdef CONFIG_MIGRATION
  extern int buffer_migrate_page(struct address_space *,
                                struct page *, struct page *);
@@@ -2482,7 -2462,6 +2490,7 @@@ static const struct file_operations __f
        .release = simple_attr_release,                                 \
        .read    = simple_attr_read,                                    \
        .write   = simple_attr_write,                                   \
 +      .llseek  = generic_file_llseek,                                 \
  };
  
  static inline void __attribute__((format(printf, 1, 2)))
@@@ -2503,10 -2482,7 +2511,10 @@@ ssize_t simple_attr_write(struct file *
  struct ctl_table;
  int proc_nr_files(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
 -
 +int proc_nr_dentry(struct ctl_table *table, int write,
 +                void __user *buffer, size_t *lenp, loff_t *ppos);
 +int proc_nr_inodes(struct ctl_table *table, int write,
 +                 void __user *buffer, size_t *lenp, loff_t *ppos);
  int __init get_filesystem_list(char *buf);
  
  #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
@@@ -10,6 -10,8 +10,6 @@@
  struct backing_dev_info;
  
  extern spinlock_t inode_lock;
 -extern struct list_head inode_in_use;
 -extern struct list_head inode_unused;
  
  /*
   * fs/fs-writeback.c
@@@ -141,14 -143,14 +141,16 @@@ typedef int (*writepage_t)(struct page 
  
  int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc);
+ void tag_pages_for_writeback(struct address_space *mapping,
+                            pgoff_t start, pgoff_t end);
  int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
  int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
  void set_page_dirty_balance(struct page *page, int page_mkwrite);
  void writeback_set_ratelimit(void);
  
  /* pdflush.c */
  extern int nr_pdflush_threads;        /* Global so it can be exported to sysctl
@@@ -21,7 -21,8 +21,8 @@@ TRACE_EVENT(ext4_free_inode
        TP_ARGS(inode),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        uid_t,  uid                     )
@@@ -30,7 -31,8 +31,8 @@@
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->uid    = inode->i_uid;
                __entry->blocks = inode->i_blocks;
        ),
  
-       TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode, __entry->uid, __entry->gid,
+       TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 __entry->uid, __entry->gid,
                  (unsigned long long) __entry->blocks)
  );
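With jbd2_dev_to_name() gone, events render the device as a numeric major,minor pair. A hypothetical line in the new format, with all values invented for illustration:

	ext4_free_inode: dev 253,0 ino 4081 mode 0100644 uid 0 gid 0 blocks 8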
  
@@@ -50,20 -53,22 +53,22 @@@ TRACE_EVENT(ext4_request_inode
        TP_ARGS(dir, mode),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  dir                     )
                __field(        umode_t, mode                   )
        ),
  
        TP_fast_assign(
-               __entry->dev    = dir->i_sb->s_dev;
+               __entry->dev_major = MAJOR(dir->i_sb->s_dev);
+               __entry->dev_minor = MINOR(dir->i_sb->s_dev);
                __entry->dir    = dir->i_ino;
                __entry->mode   = mode;
        ),
  
-       TP_printk("dev %s dir %lu mode 0%o",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir,
-                 __entry->mode)
+       TP_printk("dev %d,%d dir %lu mode 0%o",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->dir, __entry->mode)
  );
  
  TRACE_EVENT(ext4_allocate_inode,
        TP_ARGS(inode, dir, mode),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        ino_t,  dir                     )
                __field(        umode_t, mode                   )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->dir    = dir->i_ino;
                __entry->mode   = mode;
        ),
  
-       TP_printk("dev %s ino %lu dir %lu mode 0%o",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
  );
  
@@@ -98,7 -106,8 +106,8 @@@ DECLARE_EVENT_CLASS(ext4__write_begin
        TP_ARGS(inode, pos, len, flags),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        loff_t, pos                     )
                __field(        unsigned int, len               )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->pos    = pos;
                __entry->len    = len;
                __entry->flags  = flags;
        ),
  
-       TP_printk("dev %s ino %lu pos %llu len %u flags %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->flags)
  );
  
@@@ -141,7 -152,8 +152,8 @@@ DECLARE_EVENT_CLASS(ext4__write_end
        TP_ARGS(inode, pos, len, copied),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        loff_t, pos                     )
                __field(        unsigned int, len               )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->pos    = pos;
                __entry->len    = len;
                __entry->copied = copied;
        ),
  
-       TP_printk("dev %s ino %lu pos %llu len %u copied %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->copied)
+       TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->pos,
+                 __entry->len, __entry->copied)
  );
  
  DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
@@@ -199,21 -213,23 +213,23 @@@ TRACE_EVENT(ext4_writepage
        TP_ARGS(inode, page),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        pgoff_t, index                  )
  
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->index  = page->index;
        ),
  
-       TP_printk("dev %s ino %lu page_index %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->index)
+       TP_printk("dev %d,%d ino %lu page_index %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->index)
  );
  
  TRACE_EVENT(ext4_da_writepages,
        TP_ARGS(inode, wbc),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        long,   nr_to_write             )
                __field(        long,   pages_skipped           )
                __field(        loff_t, range_start             )
                __field(        loff_t, range_end               )
--              __field(        char,   nonblocking             )
                __field(        char,   for_kupdate             )
                __field(        char,   for_reclaim             )
                __field(        char,   range_cyclic            )
        ),
  
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->nr_to_write    = wbc->nr_to_write;
                __entry->pages_skipped  = wbc->pages_skipped;
                __entry->range_start    = wbc->range_start;
                __entry->range_end      = wbc->range_end;
 -              __entry->nonblocking    = wbc->nonblocking;
                __entry->for_kupdate    = wbc->for_kupdate;
                __entry->for_reclaim    = wbc->for_reclaim;
                __entry->range_cyclic   = wbc->range_cyclic;
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
  
-       TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld "
 -      TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu",
++      TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
 +                "range_start %llu range_end %llu "
 +                "for_kupdate %d for_reclaim %d "
 +                "range_cyclic %d writeback_index %lu",
-                 jbd2_dev_to_name(__entry->dev),
+                 __entry->dev_major, __entry->dev_minor,
                  (unsigned long) __entry->ino, __entry->nr_to_write,
                  __entry->pages_skipped, __entry->range_start,
 -                __entry->range_end, __entry->nonblocking,
 +                __entry->range_end,
                  __entry->for_kupdate, __entry->for_reclaim,
                  __entry->range_cyclic,
                  (unsigned long) __entry->writeback_index)
@@@ -267,7 -283,8 +284,8 @@@ TRACE_EVENT(ext4_da_write_pages
        TP_ARGS(inode, mpd),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  b_blocknr               )
                __field(        __u32,  b_size                  )
        ),
  
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->b_blocknr      = mpd->b_blocknr;
                __entry->b_size         = mpd->b_size;
                __entry->pages_written  = mpd->pages_written;
        ),
  
-       TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->b_blocknr, __entry->b_size,
                  __entry->b_state, __entry->first_page,
                  __entry->io_done, __entry->pages_written)
@@@ -302,7 -321,8 +322,8 @@@ TRACE_EVENT(ext4_da_writepages_result
        TP_ARGS(inode, wbc, ret, pages_written),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        int,    ret                     )
                __field(        int,    pages_written           )
        ),
  
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->ret            = ret;
                __entry->pages_written  = pages_written;
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
  
-       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
-                 jbd2_dev_to_name(__entry->dev),
+       TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
+                 __entry->dev_major, __entry->dev_minor,
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
                  __entry->more_io,
@@@ -336,20 -357,23 +358,23 @@@ TRACE_EVENT(ext4_discard_blocks
        TP_ARGS(sb, blk, count),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        __u64,  blk                     )
                __field(        __u64,  count                   )
  
        ),
  
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->blk    = blk;
                __entry->count  = count;
        ),
  
-       TP_printk("dev %s blk %llu count %llu",
-                 jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count)
+       TP_printk("dev %d,%d blk %llu count %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->blk, __entry->count)
  );
  
  DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        TP_ARGS(ac, pa),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  pa_pstart               )
                __field(        __u32,  pa_len                  )
        ),
  
        TP_fast_assign(
-               __entry->dev            = ac->ac_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_sb->s_dev);
                __entry->ino            = ac->ac_inode->i_ino;
                __entry->pa_pstart      = pa->pa_pstart;
                __entry->pa_len         = pa->pa_len;
                __entry->pa_lstart      = pa->pa_lstart;
        ),
  
-       TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
+       TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->pa_pstart,
+                 __entry->pa_len, __entry->pa_lstart)
  );
  
  DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
@@@ -398,14 -425,15 +426,15 @@@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_n
  
  TRACE_EVENT(ext4_mb_release_inode_pa,
        TP_PROTO(struct super_block *sb,
-                struct ext4_allocation_context *ac,
+                struct inode *inode,
                 struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),
  
-       TP_ARGS(sb, ac, pa, block, count),
+       TP_ARGS(sb, inode, pa, block, count),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  block                   )
                __field(        __u32,  count                   )
        ),
  
        TP_fast_assign(
-               __entry->dev            = sb->s_dev;
-               __entry->ino            = (ac && ac->ac_inode) ? 
-                                               ac->ac_inode->i_ino : 0;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
+               __entry->ino            = inode->i_ino;
                __entry->block          = block;
                __entry->count          = count;
        ),
  
-       TP_printk("dev %s ino %lu block %llu count %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->block, __entry->count)
+       TP_printk("dev %d,%d ino %lu block %llu count %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->block, __entry->count)
  );
  
  TRACE_EVENT(ext4_mb_release_group_pa,
        TP_PROTO(struct super_block *sb,
-                struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),
  
-       TP_ARGS(sb, ac, pa),
+       TP_ARGS(sb, pa),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        __u64,  pa_pstart               )
                __field(        __u32,  pa_len                  )
  
        ),
  
        TP_fast_assign(
-               __entry->dev            = sb->s_dev;
-               __entry->ino            = (ac && ac->ac_inode) ?
-                                               ac->ac_inode->i_ino : 0;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
                __entry->pa_pstart      = pa->pa_pstart;
                __entry->pa_len         = pa->pa_len;
        ),
  
-       TP_printk("dev %s pstart %llu len %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len)
+       TP_printk("dev %d,%d pstart %llu len %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->pa_pstart, __entry->pa_len)
  );
  
  TRACE_EVENT(ext4_discard_preallocations,
        TP_ARGS(inode),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
  
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
        ),
  
-       TP_printk("dev %s ino %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
+       TP_printk("dev %d,%d ino %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino)
  );
  
  TRACE_EVENT(ext4_mb_discard_preallocations,
        TP_ARGS(sb, needed),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        int,    needed                  )
  
        ),
  
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->needed = needed;
        ),
  
-       TP_printk("dev %s needed %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->needed)
+       TP_printk("dev %d,%d needed %d",
+                 __entry->dev_major, __entry->dev_minor, __entry->needed)
  );
  
  TRACE_EVENT(ext4_request_blocks,
        TP_ARGS(ar),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        unsigned int, flags             )
                __field(        unsigned int, len               )
        ),
  
        TP_fast_assign(
-               __entry->dev    = ar->inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev);
                __entry->ino    = ar->inode->i_ino;
                __entry->flags  = ar->flags;
                __entry->len    = ar->len;
                __entry->pright = ar->pright;
        ),
  
-       TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->flags, __entry->len,
                  (unsigned long long) __entry->logical,
                  (unsigned long long) __entry->goal,
@@@ -540,7 -575,8 +576,8 @@@ TRACE_EVENT(ext4_allocate_blocks
        TP_ARGS(ar, block),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  block                   )
                __field(        unsigned int, flags             )
        ),
  
        TP_fast_assign(
-               __entry->dev    = ar->inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev);
                __entry->ino    = ar->inode->i_ino;
                __entry->block  = block;
                __entry->flags  = ar->flags;
                __entry->pright = ar->pright;
        ),
  
-       TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->flags, __entry->len, __entry->block,
+       TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->flags,
+                 __entry->len, __entry->block,
                  (unsigned long long) __entry->logical,
                  (unsigned long long) __entry->goal,
                  (unsigned long long) __entry->lleft,
@@@ -585,7 -623,8 +624,8 @@@ TRACE_EVENT(ext4_free_blocks
        TP_ARGS(inode, block, count, flags),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(      umode_t, mode                     )
                __field(        __u64,  block                   )
        ),
  
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->mode           = inode->i_mode;
                __entry->block          = block;
                __entry->flags          = flags;
        ),
  
-       TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->mode, __entry->block, __entry->count,
                  __entry->flags)
  );
@@@ -614,7 -655,8 +656,8 @@@ TRACE_EVENT(ext4_sync_file
        TP_ARGS(file, datasync),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        ino_t,  parent                  )
                __field(        int,    datasync                )
        TP_fast_assign(
                struct dentry *dentry = file->f_path.dentry;
  
-               __entry->dev            = dentry->d_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(dentry->d_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(dentry->d_inode->i_sb->s_dev);
                __entry->ino            = dentry->d_inode->i_ino;
                __entry->datasync       = datasync;
                __entry->parent         = dentry->d_parent->d_inode->i_ino;
        ),
  
-       TP_printk("dev %s ino %ld parent %ld datasync %d ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
  );
  
@@@ -640,18 -684,20 +685,20 @@@ TRACE_EVENT(ext4_sync_fs
        TP_ARGS(sb, wait),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        int,    wait                    )
  
        ),
  
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->wait   = wait;
        ),
  
-       TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev),
-                 __entry->wait)
+       TP_printk("dev %d,%d wait %d", __entry->dev_major,
+                 __entry->dev_minor, __entry->wait)
  );
  
  TRACE_EVENT(ext4_alloc_da_blocks,
        TP_ARGS(inode),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field( unsigned int,  data_blocks     )
                __field( unsigned int,  meta_blocks     )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
        ),
  
-       TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->data_blocks, __entry->meta_blocks)
  );
  
@@@ -684,7 -733,8 +734,8 @@@ TRACE_EVENT(ext4_mballoc_alloc
        TP_ARGS(ac),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u16,  found                   )
                __field(        __u16,  groups                  )
        ),
  
        TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_inode->i_sb->s_dev);
                __entry->ino            = ac->ac_inode->i_ino;
                __entry->found          = ac->ac_found;
                __entry->flags          = ac->ac_flags;
                __entry->result_len     = ac->ac_f_ex.fe_len;
        ),
  
-       TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
+       TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
                  "tail %u broken %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->goal_group, __entry->goal_start,
@@@ -750,7 -802,8 +803,8 @@@ TRACE_EVENT(ext4_mballoc_prealloc
        TP_ARGS(ac),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u32,  orig_logical            )
                __field(          int,  orig_start              )
        ),
  
        TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_inode->i_sb->s_dev);
                __entry->ino            = ac->ac_inode->i_ino;
                __entry->orig_logical   = ac->ac_o_ex.fe_logical;
                __entry->orig_start     = ac->ac_o_ex.fe_start;
                __entry->result_len     = ac->ac_b_ex.fe_len;
        ),
  
-       TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->result_group, __entry->result_start,
  );
  
  DECLARE_EVENT_CLASS(ext4__mballoc,
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
  
-       TP_ARGS(ac),
+       TP_ARGS(sb, inode, group, start, len),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
-               __field(        __u32,  result_logical          )
                __field(          int,  result_start            )
                __field(        __u32,  result_group            )
                __field(          int,  result_len              )
        ),
  
        TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
-               __entry->ino            = ac->ac_inode->i_ino;
-               __entry->result_logical = ac->ac_b_ex.fe_logical;
-               __entry->result_start   = ac->ac_b_ex.fe_start;
-               __entry->result_group   = ac->ac_b_ex.fe_group;
-               __entry->result_len     = ac->ac_b_ex.fe_len;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
+               __entry->ino            = inode ? inode->i_ino : 0;
+               __entry->result_start   = start;
+               __entry->result_group   = group;
+               __entry->result_len     = len;
        ),
  
-       TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->result_group, __entry->result_start,
-                 __entry->result_len, __entry->result_logical)
+                 __entry->result_len)
  );
  
  DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
  
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
  
-       TP_ARGS(ac)
+       TP_ARGS(sb, inode, group, start, len)
  );
  
  DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
  
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
  
-       TP_ARGS(ac)
+       TP_ARGS(sb, inode, group, start, len)
  );
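
With the class prototype reworked, the two mballoc events no longer require a fully populated ext4_allocation_context; call sites pass the superblock, an optional inode (NULL is tolerated via the `inode ? inode->i_ino : 0` guard above), and the group/start/len values they already hold. A plain-C analogy, not kernel code, for the factoring that DECLARE_EVENT_CLASS/DEFINE_EVENT expresses — one shared emitter, thin per-event wrappers; the struct layouts here are simplified stand-ins, not the real kernel definitions:

	#include <stdio.h>

	struct super_block { int dev_major, dev_minor; };
	struct inode { unsigned long i_ino; };

	/* The "class": shared payload capture and formatting. */
	static void ext4__mballoc_emit(const char *name, struct super_block *sb,
				       struct inode *inode, unsigned group,
				       int start, int len)
	{
		/* NULL inode is valid, as in the tracepoint's "inode ? ... : 0". */
		unsigned long ino = inode ? inode->i_ino : 0;

		printf("%s: dev %d,%d inode %lu extent %u/%d/%u\n",
		       name, sb->dev_major, sb->dev_minor, ino,
		       group, start, (unsigned) len);
	}

	/* The "DEFINE_EVENT"s: same signature, distinct names. */
	static void trace_ext4_mballoc_discard(struct super_block *sb,
					       struct inode *inode,
					       unsigned group, int start, int len)
	{
		ext4__mballoc_emit("ext4_mballoc_discard", sb, inode,
				   group, start, len);
	}

	static void trace_ext4_mballoc_free(struct super_block *sb,
					    struct inode *inode,
					    unsigned group, int start, int len)
	{
		ext4__mballoc_emit("ext4_mballoc_free", sb, inode,
				   group, start, len);
	}

	int main(void)
	{
		struct super_block sb = { 253, 2 };	/* hypothetical values */

		trace_ext4_mballoc_discard(&sb, NULL, 7, 128, 64);
		trace_ext4_mballoc_free(&sb, NULL, 7, 128, 64);
		return 0;
	}

Under this shape, a free path that never built an allocation context can still fire trace_ext4_mballoc_free() with values it already has on hand, which is what dropping the ac parameter buys the callers.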
  
  TRACE_EVENT(ext4_forget,
        TP_ARGS(inode, is_metadata, block),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        int,    is_metadata             )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->is_metadata = is_metadata;
                __entry->block  = block;
        ),
  
-       TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode, __entry->is_metadata, __entry->block)
+       TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 __entry->is_metadata, __entry->block)
  );
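
For comparison, here is how a single ext4_forget event renders under the old and new TP_printk formats. The values are hypothetical, and "sda1" merely stands in for whatever string jbd2_dev_to_name() would have resolved at print time:

	#include <stdio.h>

	int main(void)
	{
		/* Old: "dev %s ino %lu mode 0%o is_metadata %d block %llu" */
		printf("ext4_forget: dev %s ino %lu mode 0%o is_metadata %d block %llu\n",
		       "sda1", 12UL, 0100644, 0, 34567ULL);

		/* New: "dev %d,%d ino %lu mode 0%o is_metadata %d block %llu" */
		printf("ext4_forget: dev %d,%d ino %lu mode 0%o is_metadata %d block %llu\n",
		       8, 1, 12UL, 0100644, 0, 34567ULL);
		return 0;
	}

The numeric form costs nothing at event time beyond two ints per entry and stays meaningful even if the device has gone away by the time the trace buffer is read.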
  
  TRACE_EVENT(ext4_da_update_reserve_space,
        TP_ARGS(inode, used_blocks),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        __u64,  i_blocks                )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
        ),
  
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode,  (unsigned long long) __entry->i_blocks,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 (unsigned long long) __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -892,7 -966,8 +967,8 @@@ TRACE_EVENT(ext4_da_reserve_space
        TP_ARGS(inode, md_needed),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        __u64,  i_blocks                )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
        ),
  
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->mode, (unsigned long long) __entry->i_blocks,
                  __entry->md_needed, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks)
@@@ -924,7 -1001,8 +1002,8 @@@ TRACE_EVENT(ext4_da_release_space
        TP_ARGS(inode, freed_blocks),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        __u64,  i_blocks                )
        ),
  
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
                __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
        ),
  
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->mode, (unsigned long long) __entry->i_blocks,
                  __entry->freed_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
@@@ -958,18 -1038,20 +1039,20 @@@ DECLARE_EVENT_CLASS(ext4__bitmap_load
        TP_ARGS(sb, group),
  
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        __u32,  group                   )
  
        ),
  
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->group  = group;
        ),
  
-       TP_printk("dev %s group %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->group)
+       TP_printk("dev %d,%d group %u",
+                 __entry->dev_major, __entry->dev_minor, __entry->group)
  );
  
  DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,