Merge branch 'upstream-merge' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Oct 2010 04:54:31 +0000 (21:54 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Oct 2010 04:54:31 +0000 (21:54 -0700)
* 'upstream-merge' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  ext4,jbd2: convert tracepoints to use major/minor numbers
  ext4: optimize orphan_list handling for ext4_setattr
  ext4: fix unbalanced mutex unlock in error path of ext4_li_request_new
  ext4: fix compile error in ext4_fallocate()
  ext4: move ext4_mb_{get,put}_buddy_cache_lock and make them static
  ext4: rename mark_bitmap_end() to ext4_mark_bitmap_end()
  ext4: move flush_completed_IO to fs/ext4/fsync.c and make it static
  ext4: rename {ext,idx}_pblock and inline small extent functions
  ext4: make various ext4 functions be static
  ext4: rename {exit,init}_ext4_*() to ext4_{exit,init}_*()
  ext4: fix kernel oops if the journal superblock has a non-zero j_errno
  ext4: update writeback_index based on last page scanned
  ext4: implement writeback livelock avoidance using page tagging
  ext4: tidy up a void argument in inode.c
  ext4: add batched_discard into ext4 feature list
  ext4: Add batched discard support for ext4
  fs: Add FITRIM ioctl
  ext4: Use return value from sb_issue_discard()
  ext4: Check return value of sb_getblk() and friends
  ext4: use bio layer instead of buffer layer in mpage_da_submit_io
  ...

33 files changed:
Documentation/filesystems/ext4.txt
fs/ext4/Makefile
fs/ext4/balloc.c
fs/ext4/block_validity.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/migrate.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c [new file with mode: 0644]
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/ioctl.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
include/linux/blkdev.h
include/linux/fs.h
include/linux/jbd2.h
include/linux/percpu_counter.h
include/linux/writeback.h
include/trace/events/ext4.h
include/trace/events/jbd2.h

index e1def17..6ab9442 100644 (file)
@@ -353,6 +353,20 @@ noauto_da_alloc            replacing existing files via patterns such as
                        system crashes before the delayed allocation
                        blocks are forced to disk.
 
+noinit_itable          Do not initialize any uninitialized inode table
+                       blocks in the background.  This feature may be
+                       used by installation CDs so that the install
+                       process can complete as quickly as possible; the
+                       inode table initialization process would then be
+                       deferred until the next time the file system
+                       is unmounted.
+
+init_itable=n          The lazy itable init code will wait n times the
+                       number of milliseconds it took to zero out the
+                       previous block group's inode table.  This
+                       minimizes the impact on system performance
+                       while the file system's inode table is being initialized.
+
 discard                Controls whether ext4 should issue discard/TRIM
 nodiscard(*)           commands to the underlying block device when
                        blocks are freed.  This is useful for SSD devices
index 8867b2a..c947e36 100644 (file)
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
                ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
 
index bd30799..14c3af2 100644 (file)
@@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * less than the blocksize * 8 ( which is the size
                 * of bitmap ), set rest of the block bitmap to 1
                 */
-               mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
+               ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8,
+                                    bh->b_data);
        }
        return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -489,7 +490,7 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
        s64 free_blocks, dirty_blocks, root_blocks;
        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
index 3db5084..fac90f3 100644 (file)
@@ -29,16 +29,15 @@ struct ext4_system_zone {
 
 static struct kmem_cache *ext4_system_zone_cachep;
 
-int __init init_ext4_system_zone(void)
+int __init ext4_init_system_zone(void)
 {
-       ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
-                                            SLAB_RECLAIM_ACCOUNT);
+       ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
        if (ext4_system_zone_cachep == NULL)
                return -ENOMEM;
        return 0;
 }
 
-void exit_ext4_system_zone(void)
+void ext4_exit_system_zone(void)
 {
        kmem_cache_destroy(ext4_system_zone_cachep);
 }
index 374510f..ece76fb 100644 (file)
@@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode,
                                struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ext4_llseek,
        .read           = generic_read_dir,
        .readdir        = ext4_readdir,         /* we take BKL. needed?*/
        .unlocked_ioctl = ext4_ioctl,
index 889ec9d..8b5dd63 100644 (file)
@@ -168,7 +168,20 @@ struct mpage_da_data {
        int pages_written;
        int retval;
 };
-#define        EXT4_IO_UNWRITTEN       0x1
+
+/*
+ * Flags for ext4_io_end->flags
+ */
+#define        EXT4_IO_END_UNWRITTEN   0x0001
+#define EXT4_IO_END_ERROR      0x0002
+
+struct ext4_io_page {
+       struct page     *p_page;
+       int             p_count;
+};
+
+#define MAX_IO_PAGES 128
+
 typedef struct ext4_io_end {
        struct list_head        list;           /* per-file finished IO list */
        struct inode            *inode;         /* file being written to */
@@ -179,8 +192,18 @@ typedef struct ext4_io_end {
        struct work_struct      work;           /* data work queue */
        struct kiocb            *iocb;          /* iocb struct for AIO */
        int                     result;         /* error value for AIO */
+       int                     num_io_pages;
+       struct ext4_io_page     *pages[MAX_IO_PAGES];
 } ext4_io_end_t;
 
+struct ext4_io_submit {
+       int                     io_op;
+       struct bio              *io_bio;
+       ext4_io_end_t           *io_end;
+       struct ext4_io_page     *io_page;
+       sector_t                io_next_block;
+};
+
 /*
  * Special inodes numbers
  */
@@ -205,6 +228,7 @@ typedef struct ext4_io_end {
 #define EXT4_MIN_BLOCK_SIZE            1024
 #define        EXT4_MAX_BLOCK_SIZE             65536
 #define EXT4_MIN_BLOCK_LOG_SIZE                10
+#define EXT4_MAX_BLOCK_LOG_SIZE                16
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)            ((s)->s_blocksize)
 #else
@@ -889,6 +913,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT      0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY      0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD             0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE    0x80000000 /* Initialize uninitialized itables */
 
 #define clear_opt(o, opt)              o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)                        o |= EXT4_MOUNT_##opt
@@ -1087,7 +1112,6 @@ struct ext4_sb_info {
        struct completion s_kobj_unregister;
 
        /* Journaling */
-       struct inode *s_journal_inode;
        struct journal_s *s_journal;
        struct list_head s_orphan;
        struct mutex s_orphan_lock;
@@ -1120,10 +1144,7 @@ struct ext4_sb_info {
        /* for buddy allocator */
        struct ext4_group_info ***s_group_info;
        struct inode *s_buddy_cache;
-       long s_blocks_reserved;
-       spinlock_t s_reserve_lock;
        spinlock_t s_md_lock;
-       tid_t s_last_transaction;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
 
@@ -1141,7 +1162,6 @@ struct ext4_sb_info {
        unsigned long s_mb_last_start;
 
        /* stats for buddy allocator */
-       spinlock_t s_mb_pa_lock;
        atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
        atomic_t s_bal_success; /* we found long enough chunks */
        atomic_t s_bal_allocated;       /* in blocks */
@@ -1172,6 +1192,11 @@ struct ext4_sb_info {
 
        /* timer for periodic error stats printing */
        struct timer_list s_err_report;
+
+       /* Lazy inode table initialization info */
+       struct ext4_li_request *s_li_request;
+       /* Wait multiplier for lazy initialization thread */
+       unsigned int s_li_wait_mult;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1533,7 +1558,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                        ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
-extern struct proc_dir_entry *ext4_proc_root;
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */
+#define EXT4_DEF_LI_WAIT_MULT                  10
+#define EXT4_DEF_LI_MAX_START_DELAY            5
+#define EXT4_LAZYINIT_QUIT                     0x0001
+#define EXT4_LAZYINIT_RUNNING                  0x0002
+
+/*
+ * Lazy inode table initialization info
+ */
+struct ext4_lazy_init {
+       unsigned long           li_state;
+
+       wait_queue_head_t       li_wait_daemon;
+       wait_queue_head_t       li_wait_task;
+       struct timer_list       li_timer;
+       struct task_struct      *li_task;
+
+       struct list_head        li_request_list;
+       struct mutex            li_list_mtx;
+};
+
+struct ext4_li_request {
+       struct super_block      *lr_super;
+       struct ext4_sb_info     *lr_sbi;
+       ext4_group_t            lr_next_group;
+       struct list_head        lr_request;
+       unsigned long           lr_next_sched;
+       unsigned long           lr_timeout;
+};
+
+struct ext4_features {
+       struct kobject f_kobj;
+       struct completion f_kobj_unregister;
+};
 
 /*
  * Function prototypes
@@ -1561,7 +1621,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1605,11 +1664,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-                                      struct buffer_head *bh,
-                                      ext4_group_t group,
-                                      struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+                                ext4_group_t group, int barrier);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1620,16 +1677,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
-extern int __init init_ext4_mballoc(void);
-extern void exit_ext4_mballoc(void);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
-                                               ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
@@ -1657,13 +1713,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
 /* ioctl.c */
@@ -1960,6 +2014,7 @@ extern const struct file_operations ext4_dir_operations;
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -1973,8 +2028,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* block_validity */
 extern void ext4_release_system_zone(struct super_block *sb);
 extern int ext4_setup_system_zone(struct super_block *sb);
-extern int __init init_ext4_system_zone(void);
-extern void exit_ext4_system_zone(void);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
                                 ext4_fsblk_t start_blk,
                                 unsigned int count);
@@ -2002,6 +2057,17 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
 
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+                              struct page *page,
+                              int len,
+                              struct writeback_control *wbc);
 
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
index bdb6ce7..28ce70f 100644 (file)
@@ -225,11 +225,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
        ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
 
+/*
+ * ext4_ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+       ext4_fsblk_t block;
+
+       block = le32_to_cpu(ex->ee_start_lo);
+       block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+       return block;
+}
+
+/*
+ * ext4_idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+       ext4_fsblk_t block;
+
+       block = le32_to_cpu(ix->ei_leaf_lo);
+       block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+       return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+                                        ext4_fsblk_t pb)
+{
+       ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+       ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+                                     0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+                                        ext4_fsblk_t pb)
+{
+       ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+       ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) &
+                                    0xffff);
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
                                         sector_t lblocks);
-extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
-extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
-extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
@@ -237,19 +286,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 extern int ext4_can_extents_be_merged(struct inode *inode,
                                      struct ext4_extent *ex1,
                                      struct ext4_extent *ex2);
-extern int ext4_ext_try_to_merge(struct inode *inode,
-                                struct ext4_ext_path *path,
-                                struct ext4_extent *);
-extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
-                                                       ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
                                                        struct ext4_ext_path *);
-extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
-                                               ext4_lblk_t *, ext4_fsblk_t *);
-extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
-                                               ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
index 06328d3..0554c48 100644 (file)
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
-
-/*
- * ext_pblock:
- * combine low and high parts of physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
-{
-       ext4_fsblk_t block;
-
-       block = le32_to_cpu(ex->ee_start_lo);
-       block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-       return block;
-}
-
-/*
- * idx_pblock:
- * combine low and high parts of a leaf physical block number into ext4_fsblk_t
- */
-ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
-{
-       ext4_fsblk_t block;
-
-       block = le32_to_cpu(ix->ei_leaf_lo);
-       block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-       return block;
-}
-
-/*
- * ext4_ext_store_pblock:
- * stores a large physical block number into an extent struct,
- * breaking it into parts
- */
-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
-{
-       ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-       ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
-/*
- * ext4_idx_store_pblock:
- * stores a large physical block number into an index struct,
- * breaking it into parts
- */
-static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
-{
-       ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-       ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
-}
-
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
@@ -169,7 +120,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                /* try to predict block placement */
                ex = path[depth].p_ext;
                if (ex)
-                       return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+                       return (ext4_ext_pblock(ex) +
+                               (block - le32_to_cpu(ex->ee_block)));
 
                /* it looks like index is empty;
                 * try to find starting block from index itself */
@@ -354,7 +306,7 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-       ext4_fsblk_t block = ext_pblock(ext);
+       ext4_fsblk_t block = ext4_ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
 
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@ -363,7 +315,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
 {
-       ext4_fsblk_t block = idx_pblock(ext_idx);
+       ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
 
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
@@ -463,13 +415,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
        for (k = 0; k <= l; k++, path++) {
                if (path->p_idx) {
                  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-                           idx_pblock(path->p_idx));
+                           ext4_idx_pblock(path->p_idx));
                } else if (path->p_ext) {
                        ext_debug("  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
                                  ext4_ext_is_uninitialized(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
-                                 ext_pblock(path->p_ext));
+                                 ext4_ext_pblock(path->p_ext));
                } else
                        ext_debug("  []");
        }
@@ -494,7 +446,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_uninitialized(ex),
-                         ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                         ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug("\n");
 }
@@ -545,7 +497,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
 
        path->p_idx = l - 1;
        ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-                 idx_pblock(path->p_idx));
+                 ext4_idx_pblock(path->p_idx));
 
 #ifdef CHECK_BINSEARCH
        {
@@ -614,7 +566,7 @@ ext4_ext_binsearch(struct inode *inode,
        path->p_ext = l - 1;
        ext_debug("  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
-                       ext_pblock(path->p_ext),
+                       ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_uninitialized(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));
 
@@ -682,7 +634,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
                ext4_ext_binsearch_idx(inode, path + ppos, block);
-               path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+               path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
 
@@ -721,7 +673,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
-               path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+               path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 
        ext4_ext_show_path(inode, path);
 
@@ -739,9 +691,9 @@ err:
  * insert new index [@logical;@ptr] into the block at @curp;
  * check where to insert: before @curp or after @curp
  */
-int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-                               struct ext4_ext_path *curp,
-                               int logical, ext4_fsblk_t ptr)
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+                                struct ext4_ext_path *curp,
+                                int logical, ext4_fsblk_t ptr)
 {
        struct ext4_extent_idx *ix;
        int len, err;
@@ -917,7 +869,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                        EXT_MAX_EXTENT(path[depth].p_hdr)) {
                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(path[depth].p_ext->ee_block),
-                               ext_pblock(path[depth].p_ext),
+                               ext4_ext_pblock(path[depth].p_ext),
                                ext4_ext_is_uninitialized(path[depth].p_ext),
                                ext4_ext_get_actual_len(path[depth].p_ext),
                                newblock);
@@ -1007,7 +959,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
                        ext_debug("%d: move %d:%llu in new index %llu\n", i,
                                        le32_to_cpu(path[i].p_idx->ei_block),
-                                       idx_pblock(path[i].p_idx),
+                                       ext4_idx_pblock(path[i].p_idx),
                                        newblock);
                        /*memmove(++fidx, path[i].p_idx++,
                                        sizeof(struct ext4_extent_idx));
@@ -1146,7 +1098,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-                 idx_pblock(EXT_FIRST_INDEX(neh)));
+                 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
        neh->eh_depth = cpu_to_le16(path->p_depth + 1);
        err = ext4_ext_dirty(handle, inode, curp);
@@ -1232,9 +1184,9 @@ out:
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
-                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_left(struct inode *inode,
+                               struct ext4_ext_path *path,
+                               ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
        struct ext4_extent_idx *ix;
        struct ext4_extent *ex;
@@ -1286,7 +1238,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
        }
 
        *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-       *phys = ext_pblock(ex) + ee_len - 1;
+       *phys = ext4_ext_pblock(ex) + ee_len - 1;
        return 0;
 }
 
@@ -1297,9 +1249,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
  * returns 0 at @phys
  * return value contains 0 (success) or error code
  */
-int
-ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
-                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
+static int ext4_ext_search_right(struct inode *inode,
+                                struct ext4_ext_path *path,
+                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
 {
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *eh;
@@ -1342,7 +1294,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
                        }
                }
                *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext_pblock(ex);
+               *phys = ext4_ext_pblock(ex);
                return 0;
        }
 
@@ -1357,7 +1309,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
                /* next allocated block in this leaf */
                ex++;
                *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext_pblock(ex);
+               *phys = ext4_ext_pblock(ex);
                return 0;
        }
 
@@ -1376,7 +1328,7 @@ got_index:
         * follow it and find the closest allocated
         * block to the right */
        ix++;
-       block = idx_pblock(ix);
+       block = ext4_idx_pblock(ix);
        while (++depth < path->p_depth) {
                bh = sb_bread(inode->i_sb, block);
                if (bh == NULL)
@@ -1388,7 +1340,7 @@ got_index:
                        return -EIO;
                }
                ix = EXT_FIRST_INDEX(eh);
-               block = idx_pblock(ix);
+               block = ext4_idx_pblock(ix);
                put_bh(bh);
        }
 
@@ -1402,7 +1354,7 @@ got_index:
        }
        ex = EXT_FIRST_EXTENT(eh);
        *logical = le32_to_cpu(ex->ee_block);
-       *phys = ext_pblock(ex);
+       *phys = ext4_ext_pblock(ex);
        put_bh(bh);
        return 0;
 }
@@ -1573,7 +1525,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
                return 0;
 #endif
 
-       if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+       if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
                return 1;
        return 0;
 }
@@ -1585,9 +1537,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-int ext4_ext_try_to_merge(struct inode *inode,
-                         struct ext4_ext_path *path,
-                         struct ext4_extent *ex)
+static int ext4_ext_try_to_merge(struct inode *inode,
+                                struct ext4_ext_path *path,
+                                struct ext4_extent *ex)
 {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
@@ -1632,9 +1584,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
  * such that there will be no overlap, and then returns 1.
  * If there is no overlap found, it returns 0.
  */
-unsigned int ext4_ext_check_overlap(struct inode *inode,
-                                   struct ext4_extent *newext,
-                                   struct ext4_ext_path *path)
+static unsigned int ext4_ext_check_overlap(struct inode *inode,
+                                          struct ext4_extent *newext,
+                                          struct ext4_ext_path *path)
 {
        ext4_lblk_t b1, b2;
        unsigned int depth, len1;
@@ -1706,11 +1658,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
                ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-                               ext4_ext_is_uninitialized(newext),
-                               ext4_ext_get_actual_len(newext),
-                               le32_to_cpu(ex->ee_block),
-                               ext4_ext_is_uninitialized(ex),
-                               ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                         ext4_ext_is_uninitialized(newext),
+                         ext4_ext_get_actual_len(newext),
+                         le32_to_cpu(ex->ee_block),
+                         ext4_ext_is_uninitialized(ex),
+                         ext4_ext_get_actual_len(ex),
+                         ext4_ext_pblock(ex));
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        return err;
@@ -1780,7 +1733,7 @@ has_space:
                /* there is no extent in this leaf, create first one */
                ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
-                               ext_pblock(newext),
+                               ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext));
                path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@ -1794,7 +1747,7 @@ has_space:
                        ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
                                        "move %d from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
-                                       ext_pblock(newext),
+                                       ext4_ext_pblock(newext),
                                        ext4_ext_is_uninitialized(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex, len, nearex + 1, nearex + 2);
@@ -1808,7 +1761,7 @@ has_space:
                ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
                                "move %d from 0x%p to 0x%p\n",
                                le32_to_cpu(newext->ee_block),
-                               ext_pblock(newext),
+                               ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext),
                                nearex, len, nearex + 1, nearex + 2);
@@ -1819,7 +1772,7 @@ has_space:
        le16_add_cpu(&eh->eh_entries, 1);
        nearex = path[depth].p_ext;
        nearex->ee_block = newext->ee_block;
-       ext4_ext_store_pblock(nearex, ext_pblock(newext));
+       ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;
 
 merge:
@@ -1845,9 +1798,9 @@ cleanup:
        return err;
 }
 
-int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-                       ext4_lblk_t num, ext_prepare_callback func,
-                       void *cbdata)
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+                              ext4_lblk_t num, ext_prepare_callback func,
+                              void *cbdata)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_ext_cache cbex;
@@ -1923,7 +1876,7 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
                } else {
                        cbex.ec_block = le32_to_cpu(ex->ee_block);
                        cbex.ec_len = ext4_ext_get_actual_len(ex);
-                       cbex.ec_start = ext_pblock(ex);
+                       cbex.ec_start = ext4_ext_pblock(ex);
                        cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                }
 
@@ -2073,7 +2026,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 
        /* free index block */
        path--;
-       leaf = idx_pblock(path->p_idx);
+       leaf = ext4_idx_pblock(path->p_idx);
        if (unlikely(path->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                return -EIO;
@@ -2181,7 +2134,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t start;
 
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
-               start = ext_pblock(ex) + ee_len - num;
+               start = ext4_ext_pblock(ex) + ee_len - num;
                ext_debug("free last %u blocks starting %llu\n", num, start);
                ext4_free_blocks(handle, inode, 0, start, num, flags);
        } else if (from == le32_to_cpu(ex->ee_block)
@@ -2310,7 +2263,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        goto out;
 
                ext_debug("new extent: %u:%u:%llu\n", block, num,
-                               ext_pblock(ex));
+                               ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
                ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2421,9 +2374,9 @@ again:
                        struct buffer_head *bh;
                        /* go to the next level */
                        ext_debug("move to level %d (block %llu)\n",
-                                 i + 1, idx_pblock(path[i].p_idx));
+                                 i + 1, ext4_idx_pblock(path[i].p_idx));
                        memset(path + i + 1, 0, sizeof(*path));
-                       bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+                       bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
                        if (!bh) {
                                /* should we reset i_size? */
                                err = -EIO;
@@ -2535,77 +2488,21 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-       complete((struct completion *)bio->bi_private);
-}
-
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
+       ext4_fsblk_t ee_pblock;
+       unsigned int ee_len;
        int ret;
-       struct bio *bio;
-       int blkbits, blocksize;
-       sector_t ee_pblock;
-       struct completion event;
-       unsigned int ee_len, len, done, offset;
 
-
-       blkbits   = inode->i_blkbits;
-       blocksize = inode->i_sb->s_blocksize;
        ee_len    = ext4_ext_get_actual_len(ex);
-       ee_pblock = ext_pblock(ex);
-
-       /* convert ee_pblock to 512 byte sectors */
-       ee_pblock = ee_pblock << (blkbits - 9);
-
-       while (ee_len > 0) {
-
-               if (ee_len > BIO_MAX_PAGES)
-                       len = BIO_MAX_PAGES;
-               else
-                       len = ee_len;
-
-               bio = bio_alloc(GFP_NOIO, len);
-               if (!bio)
-                       return -ENOMEM;
-
-               bio->bi_sector = ee_pblock;
-               bio->bi_bdev   = inode->i_sb->s_bdev;
-
-               done = 0;
-               offset = 0;
-               while (done < len) {
-                       ret = bio_add_page(bio, ZERO_PAGE(0),
-                                                       blocksize, offset);
-                       if (ret != blocksize) {
-                               /*
-                                * We can't add any more pages because of
-                                * hardware limitations.  Start a new bio.
-                                */
-                               break;
-                       }
-                       done++;
-                       offset += blocksize;
-                       if (offset >= PAGE_CACHE_SIZE)
-                               offset = 0;
-               }
+       ee_pblock = ext4_ext_pblock(ex);
 
-               init_completion(&event);
-               bio->bi_private = &event;
-               bio->bi_end_io = bi_complete;
-               submit_bio(WRITE, bio);
-               wait_for_completion(&event);
+       ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
+       if (ret > 0)
+               ret = 0;
 
-               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-                       bio_put(bio);
-                       return -EIO;
-               }
-               bio_put(bio);
-               ee_len    -= done;
-               ee_pblock += done  << (blkbits - 9);
-       }
-       return 0;
+       return ret;
 }
 
 #define EXT4_EXT_ZERO_LEN 7
@@ -2651,12 +2548,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext_pblock(ex);
+       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
 
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
        /*
         * It is safe to convert extent to initialized via explicit
@@ -2675,7 +2572,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zeroed the full extent */
                return allocated;
@@ -2710,7 +2607,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = cpu_to_le16(ee_len - allocated);
                        ext4_ext_mark_uninitialized(ex);
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
 
                        ex3 = &newex;
@@ -2725,7 +2622,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                        goto fix_extent_len;
                                ex->ee_block = orig_ex.ee_block;
                                ex->ee_len   = orig_ex.ee_len;
-                               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                               ext4_ext_store_pblock(ex,
+                                       ext4_ext_pblock(&orig_ex));
                                ext4_ext_dirty(handle, inode, path + depth);
                                /* blocks available from map->m_lblk */
                                return allocated;
@@ -2782,7 +2680,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@ -2833,7 +2731,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zero out the first half */
                        /* blocks available from map->m_lblk */
@@ -2902,7 +2800,7 @@ insert:
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@ -2915,7 +2813,7 @@ out:
 fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@ -2973,12 +2871,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext_pblock(ex);
+       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
 
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
        /*
         * It is safe to convert extent to initialized via explicit
@@ -3027,7 +2925,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                        /* update the extent length and mark as initialized */
                        ex->ee_block = orig_ex.ee_block;
                        ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
                        /* blocks available from map->m_lblk */
@@ -3099,7 +2997,7 @@ insert:
                /* update the extent length and mark as initialized */
                ex->ee_block = orig_ex.ee_block;
                ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                ext4_ext_dirty(handle, inode, path + depth);
                /* zero out the first half */
                return allocated;
@@ -3112,7 +3010,7 @@ out:
 fix_extent_len:
        ex->ee_block = orig_ex.ee_block;
        ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
        ext4_ext_mark_uninitialized(ex);
        ext4_ext_dirty(handle, inode, path + depth);
        return err;
@@ -3180,6 +3078,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                 unmap_underlying_metadata(bdev, block + i);
 }
 
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary.
+ *
+ * Returns 0 when the flag is not set or must stay set, -EIO when the
+ * flag is set but the leaf has no entries, and otherwise the result of
+ * ext4_mark_inode_dirty() after the flag has been cleared.
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+                             struct ext4_map_blocks *map,
+                             struct ext4_ext_path *path,
+                             unsigned int len)
+{
+       int i, depth;
+       struct ext4_extent_header *eh;
+       struct ext4_extent *ex, *last_ex;
+
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+               return 0;
+
+       depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
+       ex = path[depth].p_ext; /* NOTE(review): ex is never read below */
+
+       if (unlikely(!eh->eh_entries)) {
+               EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+                                "EOFBLOCKS_FL set");
+               return -EIO;
+       }
+       last_ex = EXT_LAST_EXTENT(eh);
+       /*
+        * We should clear the EOFBLOCKS_FL flag if we are writing the
+        * last block in the last extent in the file.  First check whether
+        * the caller's range (map->m_lblk + len) reaches the end of
+        * last_ex, the last extent in this leaf.  If it does not, we can
+        * bail out from this function immediately.
+        */
+       if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+           ext4_ext_get_actual_len(last_ex))
+               return 0;
+       /*
+        * The range reaches or passes the end of last_ex; last_ex is the
+        * last extent in the file only if this leaf was reached via the
+        * rightmost index at each level of the tree.
+        */
+       for (i = depth-1; i >= 0; i--)
+               if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+                       return 0;
+       ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       return ext4_mark_inode_dirty(handle, inode);
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3206,7 +3155,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * completed
                 */
                if (io)
-                       io->flag = EXT4_IO_UNWRITTEN;
+                       io->flag = EXT4_IO_END_UNWRITTEN;
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
@@ -3217,8 +3166,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
                ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                        path);
-               if (ret >= 0)
+               if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
+                       err = check_eofblocks_fl(handle, inode, map, path,
+                                                map->m_len);
+               } else
+                       err = ret;
                goto out2;
        }
        /* buffered IO case */
@@ -3244,8 +3197,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
        /* buffered write, writepage time, convert*/
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-       if (ret >= 0)
+       if (ret >= 0) {
                ext4_update_inode_fsync_trans(handle, inode, 1);
+               err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+               if (err < 0)
+                       goto out2;
+       }
+
 out:
        if (ret <= 0) {
                err = ret;
@@ -3292,6 +3250,7 @@ out2:
        }
        return err ? err : allocated;
 }
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3315,9 +3274,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
-       struct ext4_extent newex, *ex, *last_ex;
+       struct ext4_extent newex, *ex;
        ext4_fsblk_t newblock;
-       int i, err = 0, depth, ret, cache_type;
+       int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3341,7 +3300,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        /* block is already allocated */
                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
-                                  + ext_pblock(&newex);
+                                  + ext4_ext_pblock(&newex);
                        /* number of remaining blocks in the extent */
                        allocated = ext4_ext_get_actual_len(&newex) -
                                (map->m_lblk - le32_to_cpu(newex.ee_block));
@@ -3379,7 +3338,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-               ext4_fsblk_t ee_start = ext_pblock(ex);
+               ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
 
                /*
@@ -3488,7 +3447,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                        if (io)
-                               io->flag = EXT4_IO_UNWRITTEN;
+                               io->flag = EXT4_IO_END_UNWRITTEN;
                        else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
@@ -3497,44 +3456,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
 
-       if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-               if (unlikely(!eh->eh_entries)) {
-                       EXT4_ERROR_INODE(inode,
-                                        "eh->eh_entries == 0 and "
-                                        "EOFBLOCKS_FL set");
-                       err = -EIO;
-                       goto out2;
-               }
-               last_ex = EXT_LAST_EXTENT(eh);
-               /*
-                * If the current leaf block was reached by looking at
-                * the last index block all the way down the tree, and
-                * we are extending the inode beyond the last extent
-                * in the current leaf block, then clear the
-                * EOFBLOCKS_FL flag.
-                */
-               for (i = depth-1; i >= 0; i--) {
-                       if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-                               break;
-               }
-               if ((i < 0) &&
-                   (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-                    ext4_ext_get_actual_len(last_ex)))
-                       ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-       }
+       err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+       if (err)
+               goto out2;
+
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err) {
                /* free data blocks we just allocated */
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-               ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+               ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
                                 ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
 
        /* previous routine could use block we allocated */
-       newblock = ext_pblock(&newex);
+       newblock = ext4_ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
        if (allocated > map->m_len)
                allocated = map->m_len;
@@ -3729,7 +3667,7 @@ retry:
                        printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%u", __func__,
-                                   inode->i_ino, block, max_blocks);
+                                   inode->i_ino, map.m_lblk, max_blocks);
 #endif
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
index ee92b66..5a5c55d 100644 (file)
@@ -130,8 +130,50 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        return dquot_file_open(inode, filp);
 }
 
+/*
+ * ext4_llseek() is copied from generic_file_llseek(), but bounds the offset
+ * by s_bitmap_maxbytes (block-mapped) or s_maxbytes (extent-mapped inodes).
+ * Returns the new position, or -EINVAL if it is negative or above the bound.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       loff_t maxbytes;
+
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+               maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+       else
+               maxbytes = inode->i_sb->s_maxbytes;
+       mutex_lock(&inode->i_mutex);
+       switch (origin) {
+       case SEEK_END:
+               offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               if (offset == 0) { /* position query: f_pos is returned unchanged */
+                       mutex_unlock(&inode->i_mutex);
+                       return file->f_pos;
+               }
+               offset += file->f_pos;
+               break;
+       } /* no default: any other origin leaves offset as given */
+
+       if (offset < 0 || offset > maxbytes) {
+               mutex_unlock(&inode->i_mutex);
+               return -EINVAL;
+       }
+
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+       mutex_unlock(&inode->i_mutex);
+
+       return offset;
+}
+
 const struct file_operations ext4_file_operations = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ext4_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = generic_file_aio_read,
index 3f3ff5e..c1a7bc9 100644 (file)
 
 #include <trace/events/ext4.h>
 
+static void dump_completed_IO(struct inode * inode) /* debug dump of the inode's i_completed_io_list */
+{
+#ifdef EXT4_DEBUG /* without EXT4_DEBUG this function has an empty body */
+       struct list_head *cur, *before, *after;
+       ext4_io_end_t *io, *io0, *io1;
+       unsigned long flags;
+
+       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ /* checked before the lock is taken */
+               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+               return;
+       }
+
+       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+               cur = &io->list;
+               before = cur->prev;
+               io0 = container_of(before, ext4_io_end_t, list); /* neighbours are only printed as addresses */
+               after = cur->next;
+               io1 = container_of(after, ext4_io_end_t, list);
+
+               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                           io, inode->i_ino, io0, io1);
+       }
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+#endif /* EXT4_DEBUG */
+}
+
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on a workqueue but may not get scheduled
+ * immediately. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might need the conversion. This function walks through the list
+ * and converts the related unwritten extents to written.
+ * Returns 0 on success, otherwise the most recent negative error
+ * returned by ext4_end_io_nolock().
+ */
+static int flush_completed_IO(struct inode *inode)
+{
+       ext4_io_end_t *io;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long flags;
+       int ret = 0;
+       int ret2 = 0;
+
+       if (list_empty(&ei->i_completed_io_list))
+               return ret;
+
+       dump_completed_IO(inode);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&ei->i_completed_io_list)){
+               io = list_entry(ei->i_completed_io_list.next,
+                               ext4_io_end_t, list);
+               /*
+                * Calling ext4_end_io_nolock() to convert completed
+                * IO to written.
+                *
+                * When ext4_sync_file() is called, run_queue() may already
+                * be about to flush the work corresponding to this io
+                * structure. It will be upset if it finds that the io
+                * structure for the work to be scheduled has been freed.
+                *
+                * Thus we need to keep the io structure valid here after
+                * the conversion has finished. The io structure has a flag
+                * to avoid double conversion by both fsync and the
+                * background workqueue work.
+                */
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+               ret = ext4_end_io_nolock(io);
+               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+               if (ret < 0)
+                       ret2 = ret; /* NOTE(review): io is not unlinked, so the loop sees it again */
+               else
+                       list_del_init(&io->list);
+       }
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+       return (ret2 < 0) ? ret2 : 0;
+}
+
 /*
  * If we're not journaling and this is a just-created file, we have to
  * sync our parent directory (if it was freshly created) since
index 45853e0..1ce240a 100644 (file)
@@ -50,7 +50,7 @@
  * need to use it within a single byte (to ensure we get endianness right).
  * We can use memset for the rest of the bitmap as there are no other users.
  */
-void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 {
        int i;
 
@@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
-                               ext4_group_t block_group,
-                               struct ext4_group_desc *gdp)
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+                                      struct buffer_head *bh,
+                                      ext4_group_t block_group,
+                                      struct ext4_group_desc *gdp)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
        }
 
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+       ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
 
        return EXT4_INODES_PER_GROUP(sb);
@@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return NULL;
+
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
@@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
+
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                return bh;
        }
        ext4_unlock_group(sb, block_group);
+
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -411,8 +415,8 @@ struct orlov_stats {
  * for a particular block group or flex_bg.  If flex_size is 1, then g
  * is a block group number; otherwise it is flex_bg number.
  */
-void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-                      int flex_size, struct orlov_stats *stats)
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+                           int flex_size, struct orlov_stats *stats)
 {
        struct ext4_group_desc *desc;
        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
        int free = 0, retval = 0, count;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
+       /*
+        * We have to be sure that new inode allocation does not race with
+        * inode table initialization, because otherwise we may end up
+        * allocating and writing new inode right before sb_issue_zeroout
+        * takes place and overwriting our new inode with zeroes. So we
+        * take alloc_sem to prevent it.
+        */
+       down_read(&grp->alloc_sem);
        ext4_lock_group(sb, group);
        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
@@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb,
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
                ext4_unlock_group(sb, group);
+               up_read(&grp->alloc_sem);
                ext4_error(sb, "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
                           ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb,
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
        ext4_unlock_group(sb, group);
+       up_read(&grp->alloc_sem);
        return retval;
 }
 
@@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
        }
        return count;
 }
+
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ *
+ * Returns 0 on success, or if the group descriptor cannot be found or is
+ * already marked zeroed; 1 if the filesystem is read-only or the descriptor
+ * looks inconsistent; otherwise the error reported by a failing helper.
+ */
+extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                                int barrier)
+{
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_desc *gdp = NULL;
+       struct buffer_head *group_desc_bh;
+       handle_t *handle;
+       ext4_fsblk_t blk;
+       int num, ret = 0, used_blks = 0;
+
+       /* This should not happen, but just to be sure check this */
+       if (sb->s_flags & MS_RDONLY) {
+               ret = 1;
+               goto out;
+       }
+
+       gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+       if (!gdp)
+               goto out;
+
+       /*
+        * We do not need to lock this, because we are the only one
+        * handling this flag.
+        */
+       if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+               goto out;
+
+       handle = ext4_journal_start_sb(sb, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+
+       down_write(&grp->alloc_sem);
+       /*
+        * If inode bitmap was already initialized there may be some
+        * used inodes so we need to skip blocks with used inodes in
+        * inode table.
+        */
+       if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+               used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                           ext4_itable_unused_count(sb, gdp)),
+                           sbi->s_inodes_per_block);
+
+       if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+               /* alloc_sem is held for write and the handle is started, */
+               /* so leave through err_out to release both. */
+               ext4_error(sb, "Something is wrong with group %u\n"
+                          "Used itable blocks: %d, itable unused count: %u\n",
+                          group, used_blks, ext4_itable_unused_count(sb, gdp));
+               ret = 1;
+               goto err_out;
+       }
+
+       blk = ext4_inode_table(sb, gdp) + used_blks;
+       num = sbi->s_itb_per_group - used_blks;
+
+       BUFFER_TRACE(group_desc_bh, "get_write_access");
+       ret = ext4_journal_get_write_access(handle, group_desc_bh);
+       if (ret)
+               goto err_out;
+
+       /*
+        * Skip zeroout if the inode table is full. But we set the ZEROED
+        * flag anyway, because obviously, when it is full it does not need
+        * further zeroing.
+        */
+       if (unlikely(num == 0))
+               goto skip_zeroout;
+
+       ext4_debug("going to zero out inode table in group %d\n", group);
+       ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+       if (ret < 0)
+               goto err_out;
+       if (barrier)
+               blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+
+skip_zeroout:
+       ext4_lock_group(sb, group);
+       gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+       ext4_unlock_group(sb, group);
+
+       BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+       ret = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+
+err_out:
+       up_write(&grp->alloc_sem);
+       ext4_journal_stop(handle);
+out:
+       return ret;
+}
index 49635ef..2d6c6c8 100644 (file)
@@ -60,6 +60,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -755,6 +761,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+               if (unlikely(!bh)) {
+                       err = -EIO;
+                       goto failed;
+               }
+
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
@@ -1207,8 +1218,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
                                break;
                        idx++;
                        num++;
-                       if (num >= max_pages)
+                       if (num >= max_pages) {
+                               done = 1;
                                break;
+                       }
                }
                pagevec_release(&pvec);
        }
@@ -1995,16 +2008,23 @@ static void ext4_da_page_release_reservation(struct page *page,
  *
  * As pages are already locked by write_cache_pages(), we can't use it
  */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                             struct ext4_map_blocks *map)
 {
-       long pages_skipped;
        struct pagevec pvec;
        unsigned long index, end;
        int ret = 0, err, nr_pages, i;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+       loff_t size = i_size_read(inode);
+       unsigned int len, block_start;
+       struct buffer_head *bh, *page_bufs = NULL;
+       int journal_data = ext4_should_journal_data(inode);
+       sector_t pblock = 0, cur_logical = 0;
+       struct ext4_io_submit io_submit;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
+       memset(&io_submit, 0, sizeof(io_submit));
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,122 +2040,108 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
+                       int commit_write = 0, redirty_page = 0;
                        struct page *page = pvec.pages[i];
 
                        index = page->index;
                        if (index > end)
                                break;
+
+                       if (index == size >> PAGE_CACHE_SHIFT)
+                               len = size & ~PAGE_CACHE_MASK;
+                       else
+                               len = PAGE_CACHE_SIZE;
+                       if (map) {
+                               cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                       inode->i_blkbits);
+                               pblock = map->m_pblk + (cur_logical -
+                                                       map->m_lblk);
+                       }
                        index++;
 
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
 
-                       pages_skipped = mpd->wbc->pages_skipped;
-                       err = mapping->a_ops->writepage(page, mpd->wbc);
-                       if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                               /*
-                                * have successfully written the page
-                                * without skipping the same
-                                */
-                               mpd->pages_written++;
                        /*
-                        * In error case, we have to continue because
-                        * remaining pages are still locked
-                        * XXX: unlock and re-dirty them?
+                        * If the page does not have buffers (for
+                        * whatever reason), try to create them using
+                        * __block_write_begin.  If this fails,
+                        * redirty the page and move on.
                         */
-                       if (ret == 0)
-                               ret = err;
-               }
-               pagevec_release(&pvec);
-       }
-       return ret;
-}
-
-/*
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
- *
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                struct ext4_map_blocks *map)
-{
-       struct inode *inode = mpd->inode;
-       struct address_space *mapping = inode->i_mapping;
-       int blocks = map->m_len;
-       sector_t pblock = map->m_pblk, cur_logical;
-       struct buffer_head *head, *bh;
-       pgoff_t index, end;
-       struct pagevec pvec;
-       int nr_pages, i;
-
-       index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-       pagevec_init(&pvec, 0);
-
-       while (index <= end) {
-               /* XXX: optimize tail */
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-
-                       index = page->index;
-                       if (index > end)
-                               break;
-                       index++;
-
-                       BUG_ON(!PageLocked(page));
-                       BUG_ON(PageWriteback(page));
-                       BUG_ON(!page_has_buffers(page));
-
-                       bh = page_buffers(page);
-                       head = bh;
-
-                       /* skip blocks out of the range */
-                       do {
-                               if (cur_logical >= map->m_lblk)
-                                       break;
-                               cur_logical++;
-                       } while ((bh = bh->b_this_page) != head);
+                       if (!page_has_buffers(page)) {
+                               if (__block_write_begin(page, 0, len,
+                                               noalloc_get_block_write)) {
+                               redirty_page:
+                                       redirty_page_for_writepage(mpd->wbc,
+                                                                  page);
+                                       unlock_page(page);
+                                       continue;
+                               }
+                               commit_write = 1;
+                       }
 
+                       bh = page_bufs = page_buffers(page);
+                       block_start = 0;
                        do {
-                               if (cur_logical >= map->m_lblk + blocks)
-                                       break;
-
-                               if (buffer_delay(bh) || buffer_unwritten(bh)) {
-
-                                       BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
-
+                               if (!bh)
+                                       goto redirty_page;
+                               if (map && (cur_logical >= map->m_lblk) &&
+                                   (cur_logical <= (map->m_lblk +
+                                                    (map->m_len - 1)))) {
                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
-                                       } else {
-                                               /*
-                                                * unwritten already should have
-                                                * blocknr assigned. Verify that
-                                                */
-                                               clear_buffer_unwritten(bh);
-                                               BUG_ON(bh->b_blocknr != pblock);
                                        }
+                                       if (buffer_unwritten(bh) ||
+                                           buffer_mapped(bh))
+                                               BUG_ON(bh->b_blocknr != pblock);
+                                       if (map->m_flags & EXT4_MAP_UNINIT)
+                                               set_buffer_uninit(bh);
+                                       clear_buffer_unwritten(bh);
+                               }
 
-                               } else if (buffer_mapped(bh))
-                                       BUG_ON(bh->b_blocknr != pblock);
-
-                               if (map->m_flags & EXT4_MAP_UNINIT)
-                                       set_buffer_uninit(bh);
+                               /* redirty page if block allocation undone */
+                               if (buffer_delay(bh) || buffer_unwritten(bh))
+                                       redirty_page = 1;
+                               bh = bh->b_this_page;
+                               block_start += bh->b_size;
                                cur_logical++;
                                pblock++;
-                       } while ((bh = bh->b_this_page) != head);
+                       } while (bh != page_bufs);
+
+                       if (redirty_page)
+                               goto redirty_page;
+
+                       if (commit_write)
+                               /* mark the buffer_heads as dirty & uptodate */
+                               block_commit_write(page, 0, len);
+
+                       /*
+                        * Delalloc doesn't support data journalling,
+                        * but eventually maybe we'll lift this
+                        * restriction.
+                        */
+                       if (unlikely(journal_data && PageChecked(page)))
+                               err = __ext4_journalled_writepage(page, len);
+                       else
+                               err = ext4_bio_write_page(&io_submit, page,
+                                                         len, mpd->wbc);
+
+                       if (!err)
+                               mpd->pages_written++;
+                       /*
+                        * In error case, we have to continue because
+                        * remaining pages are still locked
+                        */
+                       if (ret == 0)
+                               ret = err;
                }
                pagevec_release(&pvec);
        }
+       ext4_io_submit(&io_submit);
+       return ret;
 }
 
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                        sector_t logical, long blk_cnt)
 {
@@ -2187,35 +2193,32 @@ static void ext4_print_free_blocks(struct inode *inode)
 }
 
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ *       if necessary, and then submit them for I/O
  *
  * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
        int err, blks, get_blocks_flags;
-       struct ext4_map_blocks map;
+       struct ext4_map_blocks map, *mapp = NULL;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;
 
        /*
-        * We consider only non-mapped and non-allocated blocks
-        */
-       if ((mpd->b_state  & (1 << BH_Mapped)) &&
-               !(mpd->b_state & (1 << BH_Delay)) &&
-               !(mpd->b_state & (1 << BH_Unwritten)))
-               return 0;
-
-       /*
-        * If we didn't accumulate anything to write simply return
+        * If the blocks are mapped already, or we couldn't accumulate
+        * any blocks, then proceed immediately to the submission stage.
         */
-       if (!mpd->b_size)
-               return 0;
+       if ((mpd->b_size == 0) ||
+           ((mpd->b_state  & (1 << BH_Mapped)) &&
+            !(mpd->b_state & (1 << BH_Delay)) &&
+            !(mpd->b_state & (1 << BH_Unwritten))))
+               goto submit_io;
 
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);
@@ -2252,17 +2255,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 
                err = blks;
                /*
-                * If get block returns with error we simply
-                * return. Later writepage will redirty the page and
-                * writepages will find the dirty page again
+                * If get block returns EAGAIN or ENOSPC and there
+                * appears to be free blocks we will call
+                * ext4_writepage() for all of the pages which will
+                * just redirty the pages.
                 */
                if (err == -EAGAIN)
-                       return 0;
+                       goto submit_io;
 
                if (err == -ENOSPC &&
                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
-                       return 0;
+                       goto submit_io;
                }
 
                /*
@@ -2287,10 +2291,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
-               return err;
+               return;
        }
        BUG_ON(blks == 0);
 
+       mapp = &map;
        if (map.m_flags & EXT4_MAP_NEW) {
                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
                int i;
@@ -2299,18 +2304,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                        unmap_underlying_metadata(bdev, map.m_pblk + i);
        }
 
-       /*
-        * If blocks are delayed marked, we need to
-        * put actual blocknr and drop delayed bit
-        */
-       if ((mpd->b_state & (1 << BH_Delay)) ||
-           (mpd->b_state & (1 << BH_Unwritten)))
-               mpage_put_bnr_to_bhs(mpd, &map);
-
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
-                       return err;
+                       /* This only happens if the journal is aborted */
+                       return;
        }
 
        /*
@@ -2321,10 +2319,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
-               return ext4_mark_inode_dirty(handle, mpd->inode);
+               err = ext4_mark_inode_dirty(handle, mpd->inode);
+               if (err)
+                       ext4_error(mpd->inode->i_sb,
+                                  "Failed to mark inode %lu dirty",
+                                  mpd->inode->i_ino);
        }
 
-       return 0;
+submit_io:
+       mpage_da_submit_io(mpd, mapp);
+       mpd->io_done = 1;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2405,7 @@ flush_it:
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-       if (mpage_da_map_blocks(mpd) == 0)
-               mpage_da_submit_io(mpd);
-       mpd->io_done = 1;
+       mpage_da_map_and_submit(mpd);
        return;
 }
 
@@ -2422,9 +2424,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
  * The function finds extents of pages and scan them for all blocks.
  */
 static int __mpage_da_writepage(struct page *page,
-                               struct writeback_control *wbc, void *data)
+                               struct writeback_control *wbc,
+                               struct mpage_da_data *mpd)
 {
-       struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
        struct buffer_head *bh, *head;
        sector_t logical;
@@ -2435,15 +2437,13 @@ static int __mpage_da_writepage(struct page *page,
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                * and start IO on them using writepage()
+                * and start IO on them
                 */
                if (mpd->next_page != mpd->first_page) {
-                       if (mpage_da_map_blocks(mpd) == 0)
-                               mpage_da_submit_io(mpd);
+                       mpage_da_map_and_submit(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
-                       mpd->io_done = 1;
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return MPAGE_DA_EXTENT_TAIL;
@@ -2622,6 +2622,7 @@ static int __ext4_journalled_writepage(struct page *page,
        int ret = 0;
        int err;
 
+       ClearPageChecked(page);
        page_bufs = page_buffers(page);
        BUG_ON(!page_bufs);
        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2699,7 +2700,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
 {
-       int ret = 0;
+       int ret = 0, commit_write = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
@@ -2712,71 +2713,46 @@ static int ext4_writepage(struct page *page,
        else
                len = PAGE_CACHE_SIZE;
 
-       if (page_has_buffers(page)) {
-               page_bufs = page_buffers(page);
-               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                       ext4_bh_delay_or_unwritten)) {
-                       /*
-                        * We don't want to do  block allocation
-                        * So redirty the page and return
-                        * We may reach here when we do a journal commit
-                        * via journal_submit_inode_data_buffers.
-                        * If we don't have mapping block we just ignore
-                        * them. We can also reach here via shrink_page_list
-                        */
+       /*
+        * If the page does not have buffers (for whatever reason),
+        * try to create them using __block_write_begin.  If this
+        * fails, redirty the page and move on.
+        */
+       if (!page_buffers(page)) {
+               if (__block_write_begin(page, 0, len,
+                                       noalloc_get_block_write)) {
+               redirty_page:
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
-       } else {
+               commit_write = 1;
+       }
+       page_bufs = page_buffers(page);
+       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                             ext4_bh_delay_or_unwritten)) {
                /*
-                * The test for page_has_buffers() is subtle:
-                * We know the page is dirty but it lost buffers. That means
-                * that at some moment in time after write_begin()/write_end()
-                * has been called all buffers have been clean and thus they
-                * must have been written at least once. So they are all
-                * mapped and we can happily proceed with mapping them
-                * and writing the page.
-                *
-                * Try to initialize the buffer_heads and check whether
-                * all are mapped and non delay. We don't want to
-                * do block allocation here.
+                * We don't want to do block allocation, so redirty the
+                * page and return.  We may reach here when we do a
+                * journal commit via
+                * journal_submit_inode_data_buffers.  If we don't
+                * have mapping block we just ignore them. We can also
+                * reach here via shrink_page_list
                 */
-               ret = __block_write_begin(page, 0, len,
-                                         noalloc_get_block_write);
-               if (!ret) {
-                       page_bufs = page_buffers(page);
-                       /* check whether all are mapped and non delay */
-                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                               ext4_bh_delay_or_unwritten)) {
-                               redirty_page_for_writepage(wbc, page);
-                               unlock_page(page);
-                               return 0;
-                       }
-               } else {
-                       /*
-                        * We can't do block allocation here
-                        * so just redity the page and unlock
-                        * and return
-                        */
-                       redirty_page_for_writepage(wbc, page);
-                       unlock_page(page);
-                       return 0;
-               }
+               goto redirty_page;
+       }
+       if (commit_write)
                /* now mark the buffer_heads as dirty and uptodate */
                block_commit_write(page, 0, len);
-       }
 
-       if (PageChecked(page) && ext4_should_journal_data(inode)) {
+       if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
-               ClearPageChecked(page);
                return __ext4_journalled_writepage(page, len);
-       }
 
-       if (page_bufs && buffer_uninit(page_bufs)) {
+       if (buffer_uninit(page_bufs)) {
                ext4_set_bh_endio(page_bufs, inode);
                ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                            wbc, ext4_end_io_buffer_write);
@@ -2823,25 +2799,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
  */
 static int write_cache_pages_da(struct address_space *mapping,
                                struct writeback_control *wbc,
-                               struct mpage_da_data *mpd)
+                               struct mpage_da_data *mpd,
+                               pgoff_t *done_index)
 {
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
-       int nr_pages;
+       unsigned nr_pages;
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
        long nr_to_write = wbc->nr_to_write;
+       int tag;
 
        pagevec_init(&pvec, 0);
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+
+       *done_index = index;
        while (!done && (index <= end)) {
                int i;
 
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                             PAGECACHE_TAG_DIRTY,
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
@@ -2861,6 +2844,8 @@ static int write_cache_pages_da(struct address_space *mapping,
                                break;
                        }
 
+                       *done_index = page->index + 1;
+
                        lock_page(page);
 
                        /*
@@ -2946,6 +2931,8 @@ static int ext4_da_writepages(struct address_space *mapping,
        long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+       pgoff_t done_index = 0;
+       pgoff_t end;
 
        trace_ext4_da_writepages(inode, wbc);
 
@@ -2981,8 +2968,11 @@ static int ext4_da_writepages(struct address_space *mapping,
                wbc->range_start = index << PAGE_CACHE_SHIFT;
                wbc->range_end  = LLONG_MAX;
                wbc->range_cyclic = 0;
-       } else
+               end = -1;
+       } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+       }
 
        /*
         * This works around two forms of stupidity.  The first is in
@@ -3001,9 +2991,12 @@ static int ext4_da_writepages(struct address_space *mapping,
         * sbi->max_writeback_mb_bump whichever is smaller.
         */
        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-       if (!range_cyclic && range_whole)
-               desired_nr_to_write = wbc->nr_to_write * 8;
-       else
+       if (!range_cyclic && range_whole) {
+               if (wbc->nr_to_write == LONG_MAX)
+                       desired_nr_to_write = wbc->nr_to_write;
+               else
+                       desired_nr_to_write = wbc->nr_to_write * 8;
+       } else
                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                           max_pages);
        if (desired_nr_to_write > max_pages)
@@ -3020,6 +3013,9 @@ static int ext4_da_writepages(struct address_space *mapping,
        pages_skipped = wbc->pages_skipped;
 
 retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
+
        while (!ret && wbc->nr_to_write > 0) {
 
                /*
@@ -3058,16 +3054,14 @@ retry:
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-               ret = write_cache_pages_da(mapping, wbc, &mpd);
+               ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                       if (mpage_da_map_blocks(&mpd) == 0)
-                               mpage_da_submit_io(&mpd);
-                       mpd.io_done = 1;
+                       mpage_da_map_and_submit(&mpd);
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
                trace_ext4_da_write_pages(inode, &mpd);
@@ -3114,14 +3108,13 @@ retry:
                         __func__, wbc->nr_to_write, ret);
 
        /* Update index */
-       index += pages_written;
        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-               mapping->writeback_index = index;
+               mapping->writeback_index = done_index;
 
 out_writepages:
        wbc->nr_to_write -= nr_to_writebump;
@@ -3456,15 +3449,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-       BUG_ON(!io);
-       if (io->page)
-               put_page(io->page);
-       iput(io->inode);
-       kfree(io);
-}
-
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
        struct buffer_head *head, *bh;
@@ -3641,173 +3625,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
-
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
-
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
-
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * check a range of space and convert unwritten extents to written.
- */
-static int ext4_end_io_nolock(ext4_io_end_t *io)
-{
-       struct inode *inode = io->inode;
-       loff_t offset = io->offset;
-       ssize_t size = io->size;
-       int ret = 0;
-
-       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                  "list->prev 0x%p\n",
-                  io, inode->i_ino, io->list.next, io->list.prev);
-
-       if (list_empty(&io->list))
-               return ret;
-
-       if (io->flag != EXT4_IO_UNWRITTEN)
-               return ret;
-
-       ret = ext4_convert_unwritten_extents(inode, offset, size);
-       if (ret < 0) {
-               printk(KERN_EMERG "%s: failed to convert unwritten"
-                       "extents to written extents, error is %d"
-                       " io is still on inode %lu aio dio list\n",
-                       __func__, ret, inode->i_ino);
-               return ret;
-       }
-
-       if (io->iocb)
-               aio_complete(io->iocb, io->result, 0);
-       /* clear the DIO AIO unwritten flag */
-       io->flag = 0;
-       return ret;
-}
-
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
-{
-       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-       struct inode            *inode = io->inode;
-       struct ext4_inode_info  *ei = EXT4_I(inode);
-       unsigned long           flags;
-       int                     ret;
-
-       mutex_lock(&inode->i_mutex);
-       ret = ext4_end_io_nolock(io);
-       if (ret < 0) {
-               mutex_unlock(&inode->i_mutex);
-               return;
-       }
-
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (!list_empty(&io->list))
-               list_del_init(&io->list);
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       mutex_unlock(&inode->i_mutex);
-       ext4_free_io_end(io);
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int flush_completed_IO(struct inode *inode)
-{
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
-
-       if (list_empty(&ei->i_completed_io_list))
-               return ret;
-
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * convertion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               if (ret < 0)
-                       ret2 = ret;
-               else
-                       list_del_init(&io->list);
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
-}
-
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
-{
-       ext4_io_end_t *io = NULL;
-
-       io = kmalloc(sizeof(*io), flags);
-
-       if (io) {
-               igrab(inode);
-               io->inode = inode;
-               io->flag = 0;
-               io->offset = 0;
-               io->size = 0;
-               io->page = NULL;
-               io->iocb = NULL;
-               io->result = 0;
-               INIT_WORK(&io->work, ext4_end_io_work);
-               INIT_LIST_HEAD(&io->list);
-       }
-
-       return io;
-}
-
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private, int ret,
                            bool is_async)
@@ -3827,7 +3644,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
 
        /* if not aio dio with unwritten extents, just free io and return */
-       if (io_end->flag != EXT4_IO_UNWRITTEN){
+       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
                iocb->private = NULL;
 out:
@@ -3844,14 +3661,14 @@ out:
        }
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
-
        /* Add the io_end to per-inode completed aio dio list*/
        ei = EXT4_I(io_end->inode);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        list_add_tail(&io_end->list, &ei->i_completed_io_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
        iocb->private = NULL;
 }
 
@@ -3872,7 +3689,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
                goto out;
        }
 
-       io_end->flag = EXT4_IO_UNWRITTEN;
+       io_end->flag = EXT4_IO_END_UNWRITTEN;
        inode = io_end->inode;
 
        /* Add the io_end to per-inode completed io list*/
@@ -5463,6 +5280,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
+       int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
 
        error = inode_change_ok(inode, attr);
@@ -5518,8 +5336,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        error = PTR_ERR(handle);
                        goto err_out;
                }
-
-               error = ext4_orphan_add(handle, inode);
+               if (ext4_handle_valid(handle)) {
+                       error = ext4_orphan_add(handle, inode);
+                       orphan = 1;
+               }
                EXT4_I(inode)->i_disksize = attr->ia_size;
                rc = ext4_mark_inode_dirty(handle, inode);
                if (!error)
@@ -5537,6 +5357,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                        goto err_out;
                                }
                                ext4_orphan_del(handle, inode);
+                               orphan = 0;
                                ext4_journal_stop(handle);
                                goto err_out;
                        }
@@ -5559,7 +5380,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
-       if (inode->i_nlink)
+       if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);
 
        if (!rc && (ia_valid & ATTR_MODE))
@@ -5642,7 +5463,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
index 42f77b1..c58eba3 100644 (file)
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There is one cache for each unique
+ * s_blocksize_bits, shared by all mounted filesystems of that size */
+#define NR_GRPINFO_CACHES      \
+       (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -938,6 +946,85 @@ out:
        return err;
 }
 
+/*
+ * Lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * makes sure other parallel operations on the buddy
+ * cache don't happen while holding the buddy cache
+ * lock.
+ */
+static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+                                       ext4_group_t group)
+{
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       int groups_per_page;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+
+       groups_per_page = blocks_per_page >> 1;
+       if (groups_per_page == 0)
+               groups_per_page = 1;
+       /* lock all the groups that the page covers */
+       for (i = 0; i < groups_per_page; i++) {
+
+               if ((first_group + i) >= ngroups)
+                       break;
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* take the write allocation semaphore
+                * of every group. This makes sure there
+                * is no block allocation going on in
+                * any of those groups
+                */
+               down_write_nested(&grp->alloc_sem, i);
+       }
+       return i;
+}
+
+static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                        ext4_group_t group, int locked_group)
+{
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+       /* release locks on all the groups */
+       for (i = 0; i < locked_group; i++) {
+
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* release the write allocation
+                * semaphore that was taken on each
+                * of these groups by
+                * ext4_mb_get_buddy_cache_lock()
+                */
+               up_write(&grp->alloc_sem);
+       }
+
+}
+
 /*
  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
  * block group lock of all groups for this page; do not hold the BG lock when
@@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        return 0;
 }
 
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       int groups_per_page;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-
-       groups_per_page = blocks_per_page >> 1;
-       if (groups_per_page == 0)
-               groups_per_page = 1;
-       /* read all groups the page covers into the cache */
-       for (i = 0; i < groups_per_page; i++) {
-
-               if ((first_group + i) >= ngroups)
-                       break;
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               down_write_nested(&grp->alloc_sem, i);
-       }
-       return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                       ext4_group_t group, int locked_group)
-{
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       /* release locks on all the groups */
-       for (i = 0; i < locked_group; i++) {
-
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               up_write(&grp->alloc_sem);
-       }
-
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
        .release        = seq_release,
 };
 
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+       int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+       struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+       BUG_ON(!cachep);
+       return cachep;
+}
 
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                          struct ext4_group_desc *desc)
 {
-       int i, len;
+       int i;
        int metalen = 0;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **meta_group_info;
+       struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
        /*
         * First check if this group is the first of a reserved block.
@@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                        meta_group_info;
        }
 
-       /*
-        * calculate needed size. if change bb_counters size,
-        * don't forget about ext4_mb_generate_buddy()
-        */
-       len = offsetof(typeof(**meta_group_info),
-                      bb_counters[sb->s_blocksize_bits + 2]);
-
        meta_group_info =
                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
-       meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+       meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
        if (meta_group_info[i] == NULL) {
                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
                goto exit_group_info;
        }
+       memset(meta_group_info[i], 0, kmem_cache_size(cachep));
        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                &(meta_group_info[i]->bb_state));
 
@@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        int num_meta_group_infos_max;
        int array_size;
        struct ext4_group_desc *desc;
+       struct kmem_cache *cachep;
 
        /* This is the number of blocks used by GDT */
        num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2389,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
        return 0;
 
 err_freebuddy:
+       cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        while (i-- > 0)
-               kfree(ext4_get_group_info(sb, i));
+               kmem_cache_free(cachep, ext4_get_group_info(sb, i));
        i = num_meta_group_infos;
        while (i-- > 0)
                kfree(sbi->s_group_info[i]);
@@ -2407,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        unsigned offset;
        unsigned max;
        int ret;
+       int cache_index;
+       struct kmem_cache *cachep;
+       char *namep = NULL;
 
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_offsets == NULL) {
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
 
        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
-               kfree(sbi->s_mb_offsets);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+       cachep = ext4_groupinfo_caches[cache_index];
+       if (!cachep) {
+               char name[32];
+               int len = offsetof(struct ext4_group_info,
+                                       bb_counters[sb->s_blocksize_bits + 2]);
+
+               sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+               namep = kstrdup(name, GFP_KERNEL);
+               if (!namep) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               /* Need to free the kmem_cache_name() when we
+                * destroy the slab */
+               cachep = kmem_cache_create(namep, len, 0,
+                                            SLAB_RECLAIM_ACCOUNT, NULL);
+               if (!cachep) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ext4_groupinfo_caches[cache_index] = cachep;
        }
 
        /* order 0 is regular bitmap */
@@ -2440,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret != 0) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-               return ret;
+               goto out;
        }
 
        spin_lock_init(&sbi->s_md_lock);
@@ -2457,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (sbi->s_locality_groups == NULL) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
        }
        for_each_possible_cpu(i) {
                struct ext4_locality_group *lg;
@@ -2476,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-       return 0;
+out:
+       if (ret) {
+               kfree(sbi->s_mb_offsets);
+               kfree(sbi->s_mb_maxs);
+               kfree(namep);
+       }
+       return ret;
 }
 
 /* need to called with the ext4 group lock held */
@@ -2504,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb)
        int num_meta_group_infos;
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
@@ -2514,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb)
                        ext4_lock_group(sb, i);
                        ext4_mb_cleanup_pa(grinfo);
                        ext4_unlock_group(sb, i);
-                       kfree(grinfo);
+                       kmem_cache_free(cachep, grinfo);
                }
                num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2558,7 +2605,7 @@ int ext4_mb_release(struct super_block *sb)
        return 0;
 }
 
-static inline void ext4_issue_discard(struct super_block *sb,
+static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
        int ret;
@@ -2568,10 +2615,11 @@ static inline void ext4_issue_discard(struct super_block *sb,
        trace_ext4_discard_blocks(sb,
                        (unsigned long long) discard_block, count);
        ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-       if (ret == EOPNOTSUPP) {
+       if (ret == -EOPNOTSUPP) {
                ext4_warning(sb, "discard not supported, disabling");
                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
        }
+       return ret;
 }
 
 /*
@@ -2659,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void)
 
 #endif
 
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
-       ext4_pspace_cachep =
-               kmem_cache_create("ext4_prealloc_space",
-                                    sizeof(struct ext4_prealloc_space),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+                                       SLAB_RECLAIM_ACCOUNT);
        if (ext4_pspace_cachep == NULL)
                return -ENOMEM;
 
-       ext4_ac_cachep =
-               kmem_cache_create("ext4_alloc_context",
-                                    sizeof(struct ext4_allocation_context),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+                                   SLAB_RECLAIM_ACCOUNT);
        if (ext4_ac_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                return -ENOMEM;
        }
 
-       ext4_free_ext_cachep =
-               kmem_cache_create("ext4_free_block_extents",
-                                    sizeof(struct ext4_free_data),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+                                         SLAB_RECLAIM_ACCOUNT);
        if (ext4_free_ext_cachep == NULL) {
                kmem_cache_destroy(ext4_pspace_cachep);
                kmem_cache_destroy(ext4_ac_cachep);
@@ -2690,8 +2732,9 @@ int __init init_ext4_mballoc(void)
        return 0;
 }
 
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
+       int i;
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
@@ -2700,6 +2743,15 @@ void exit_ext4_mballoc(void)
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_ext_cachep);
+
+       for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+               struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+               if (cachep) {
+                       char *name = (char *)kmem_cache_name(cachep);
+                       kmem_cache_destroy(cachep);
+                       kfree(name);
+               }
+       }
        ext4_remove_debugfs_entry();
 }
 
@@ -3536,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-                       struct ext4_prealloc_space *pa,
-                       struct ext4_allocation_context *ac)
+                       struct ext4_prealloc_space *pa)
 {
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3555,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
 
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = pa->pa_inode;
-       }
-
        while (bit < end) {
                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
@@ -3570,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;
 
-               if (ac) {
-                       ac->ac_b_ex.fe_group = group;
-                       ac->ac_b_ex.fe_start = bit;
-                       ac->ac_b_ex.fe_len = next - bit;
-                       ac->ac_b_ex.fe_logical = 0;
-                       trace_ext4_mballoc_discard(ac);
-               }
-
-               trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
-                                              next - bit);
+               trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+               trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+                                              grp_blk_start + bit, next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3602,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-                               struct ext4_prealloc_space *pa,
-                               struct ext4_allocation_context *ac)
+                               struct ext4_prealloc_space *pa)
 {
        struct super_block *sb = e4b->bd_sb;
        ext4_group_t group;
        ext4_grpblk_t bit;
 
-       trace_ext4_mb_release_group_pa(sb, ac, pa);
+       trace_ext4_mb_release_group_pa(sb, pa);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = NULL;
-               ac->ac_b_ex.fe_group = group;
-               ac->ac_b_ex.fe_start = bit;
-               ac->ac_b_ex.fe_len = pa->pa_len;
-               ac->ac_b_ex.fe_logical = 0;
-               trace_ext4_mballoc_discard(ac);
-       }
+       trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
 
        return 0;
 }
@@ -3645,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
        struct list_head list;
        struct ext4_buddy e4b;
        int err;
@@ -3674,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
                needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
        INIT_LIST_HEAD(&list);
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac)
-               ac->ac_sb = sb;
 repeat:
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
@@ -3731,9 +3756,9 @@ repeat:
                spin_unlock(pa->pa_obj_lock);
 
                if (pa->pa_type == MB_GROUP_PA)
-                       ext4_mb_release_group_pa(&e4b, pa, ac);
+                       ext4_mb_release_group_pa(&e4b, pa);
                else
-                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3741,8 +3766,6 @@ repeat:
 
 out:
        ext4_unlock_group(sb, group);
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
        return free;
@@ -3763,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode)
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
        ext4_group_t group = 0;
        struct list_head list;
        struct ext4_buddy e4b;
@@ -3779,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode)
 
        INIT_LIST_HEAD(&list);
 
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = inode;
-       }
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
@@ -3853,7 +3870,7 @@ repeat:
 
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                ext4_unlock_group(sb, group);
 
                ext4_mb_unload_buddy(&e4b);
@@ -3862,8 +3879,6 @@ repeat:
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -4061,14 +4076,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
        struct ext4_buddy e4b;
        struct list_head discard_list;
        struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
 
        mb_debug(1, "discard locality group preallocation\n");
 
        INIT_LIST_HEAD(&discard_list);
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac)
-               ac->ac_sb = sb;
 
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4120,15 +4131,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                }
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
-               ext4_mb_release_group_pa(&e4b, pa, ac);
+               ext4_mb_release_group_pa(&e4b, pa);
                ext4_unlock_group(sb, group);
 
                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -4492,7 +4501,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
-       struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        unsigned long freed = 0;
        unsigned int overflow;
@@ -4532,6 +4540,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        if (!bh)
                                tbh = sb_find_get_block(inode->i_sb,
                                                        block + i);
+                       if (unlikely(!tbh))
+                               continue;
                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                    inode, tbh, block + i);
                }
@@ -4547,12 +4557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        if (!ext4_should_writeback_data(inode))
                flags |= EXT4_FREE_BLOCKS_METADATA;
 
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac) {
-               ac->ac_inode = inode;
-               ac->ac_sb = sb;
-       }
-
 do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4610,12 +4614,7 @@ do_more:
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
 #endif
-       if (ac) {
-               ac->ac_b_ex.fe_group = block_group;
-               ac->ac_b_ex.fe_start = bit;
-               ac->ac_b_ex.fe_len = count;
-               trace_ext4_mballoc_free(ac);
-       }
+       trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
 
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
@@ -4641,12 +4640,12 @@ do_more:
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, block_group, bit, count);
                ext4_lock_group(sb, block_group);
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-               if (test_opt(sb, DISCARD))
-                       ext4_issue_discard(sb, block_group, bit, count);
        }
 
        ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4686,7 +4685,190 @@ error_return:
                dquot_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
        return;
 }
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:                super block for the file system
+ * @start:     starting block of the free extent in the alloc. group
+ * @count:     number of blocks to TRIM
+ * @group:     alloc. group we are working with
+ * @e4b:       ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To ensure that no
+ * one allocates those blocks in the meantime, they are marked as used in the
+ * buddy bitmap for the duration of the discard and freed again afterwards.
+ *
+ * Must be called with the group lock held. The lock is dropped while the
+ * discard is issued and re-taken before returning, so the caller gets the
+ * lock back but must not assume the group was unchanged in between.
+ *
+ * Returns the result of ext4_issue_discard(); a non-zero result is also
+ * reported through ext4_std_error().
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+               ext4_group_t group, struct ext4_buddy *e4b)
+{
+       struct ext4_free_extent ex;
+       int ret = 0;
+
+       assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+       ex.fe_start = start;
+       ex.fe_group = group;
+       ex.fe_len = count;
+
+       /*
+        * Mark the blocks as used so that no one can allocate them while
+        * they are being trimmed; the group lock is released right after
+        * this, so the buddy bitmap is the only thing protecting them.
+        */
+       mb_mark_used(e4b, &ex);
+       ext4_unlock_group(sb, group);
+
+       ret = ext4_issue_discard(sb, group, start, count);
+       if (ret)
+               ext4_std_error(sb, ret);
+
+       ext4_lock_group(sb, group);
+       mb_free_blocks(NULL, e4b, start, ex.fe_len);
+       return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:                        super block for file system
+ * @e4b:               ext4 buddy
+ * @start:             first group block to examine
+ * @max:               last group block to examine
+ * @minblocks:         minimum extent block count
+ *
+ * ext4_trim_all_free walks through the group's bitmap (e4b->bd_bitmap),
+ * between @start (or the group's first free block, whichever is larger) and
+ * @max, searching for free extents. Every free extent of at least
+ * @minblocks blocks is handed to ext4_trim_extent, which marks it used in
+ * the buddy bitmap, issues a TRIM command on it and frees it again.
+ *
+ * The group lock is taken and released by this function; it is dropped
+ * around cond_resched() when a reschedule is needed. The scan stops early
+ * when ext4_trim_extent fails, when a fatal signal is pending, or when the
+ * free blocks not yet counted as trimmed drop below @minblocks.
+ *
+ * Returns the number of blocks trimmed, the negative error returned by
+ * ext4_trim_extent, or -ERESTARTSYS if a fatal signal was pending.
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+               ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+       void *bitmap;
+       ext4_grpblk_t next, count = 0;
+       ext4_group_t group;
+       int ret = 0;
+
+       BUG_ON(e4b == NULL);
+
+       bitmap = e4b->bd_bitmap;
+       group = e4b->bd_group;
+       start = (e4b->bd_info->bb_first_free > start) ?
+               e4b->bd_info->bb_first_free : start;
+       ext4_lock_group(sb, group);
+
+       while (start < max) {
+               start = mb_find_next_zero_bit(bitmap, max, start);
+               if (start >= max)
+                       break;
+               next = mb_find_next_bit(bitmap, max, start);
+
+               if ((next - start) >= minblocks) {
+                       ret = ext4_trim_extent(sb, start,
+                               next - start, group, e4b);
+                       if (ret < 0)
+                               break;
+                       count += next - start;
+               }
+               start = next + 1;
+
+               if (fatal_signal_pending(current)) {
+                       count = -ERESTARTSYS;
+                       break;
+               }
+
+               if (need_resched()) {
+                       ext4_unlock_group(sb, group);
+                       cond_resched();
+                       ext4_lock_group(sb, group);
+               }
+
+               if ((e4b->bd_info->bb_free - count) < minblocks)
+                       break;
+       }
+       ext4_unlock_group(sb, group);
+
+       ext4_debug("trimmed %d blocks in the group %d\n",
+               count, group);
+
+       if (ret < 0)
+               count = ret;
+
+       return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:                        superblock for filesystem
+ * @range:             fstrim_range structure
+ *
+ * range->start:       first byte to trim
+ * range->len:         number of bytes to trim from start
+ * range->minlen:      minimum extent length in bytes
+ *
+ * All three values are converted from bytes to filesystem blocks.
+ * ext4_trim_fs goes through all allocation groups containing blocks from
+ * start to start+len. For each such group that has at least minlen free
+ * blocks, ext4_trim_all_free is invoked to trim the free space inside the
+ * requested window. On return range->len holds the number of bytes trimmed.
+ *
+ * Returns 0 on success, -EINVAL if minlen is larger than a block group or
+ * the first group lies beyond the last one, or the negative error reported
+ * by ext4_mb_load_buddy / ext4_trim_all_free.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+       struct ext4_buddy e4b;
+       ext4_group_t first_group, last_group;
+       ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+       ext4_grpblk_t cnt = 0, first_block, last_block;
+       uint64_t start, len, minlen, trimmed;
+       int ret = 0;
+
+       start = range->start >> sb->s_blocksize_bits;
+       len = range->len >> sb->s_blocksize_bits;
+       minlen = range->minlen >> sb->s_blocksize_bits;
+       trimmed = 0;
+
+       if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+               return -EINVAL;
+
+       /* Determine first and last group to examine based on start and len */
+       ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+                                    &first_group, &first_block);
+       ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+                                    &last_group, &last_block);
+       last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+       last_block = EXT4_BLOCKS_PER_GROUP(sb);
+
+       if (first_group > last_group)
+               return -EINVAL;
+
+       for (group = first_group; group <= last_group; group++) {
+               ret = ext4_mb_load_buddy(sb, group, &e4b);
+               if (ret) {
+                       ext4_error(sb, "Error in loading buddy "
+                                       "information for %u", group);
+                       break;
+               }
+
+               /*
+                * The window in this group is [first_block, last_block).
+                * last_block stays at the end of the group unless the
+                * remaining length ends inside this group; the end must be
+                * measured from first_block, which is non-zero in the first
+                * group when the range does not start on a group boundary.
+                */
+               if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
+                       last_block = first_block + len;
+               len -= last_block - first_block;
+
+               /* a skipped group must not re-add the previous group's count */
+               cnt = 0;
+               if (e4b.bd_info->bb_free >= minlen)
+                       cnt = ext4_trim_all_free(sb, &e4b, first_block,
+                                               last_block, minlen);
+               ext4_mb_unload_buddy(&e4b);
+               if (cnt < 0) {
+                       ret = cnt;
+                       break;
+               }
+               trimmed += cnt;
+               first_block = 0;
+       }
+       range->len = trimmed * sb->s_blocksize;
+
+       return ret;
+}
index 1765c2c..25f3a97 100644 (file)
@@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
        struct buffer_head *bh;
        struct ext4_extent_header *eh;
 
-       block = idx_pblock(ix);
+       block = ext4_idx_pblock(ix);
        bh = sb_bread(inode->i_sb, block);
        if (!bh)
                return -EIO;
index 5f1ed9f..b9f3e78 100644 (file)
@@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
                /* leaf block */
                *extent = ++path[ppos].p_ext;
-               path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+               path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
                return 0;
        }
 
@@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 
                        /* index block */
                        path[ppos].p_idx++;
-                       path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+                       path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                        if (path[ppos+1].p_bh)
                                brelse(path[ppos+1].p_bh);
                        path[ppos+1].p_bh =
@@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                                path[cur_ppos].p_idx =
                                        EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
                                path[cur_ppos].p_block =
-                                       idx_pblock(path[cur_ppos].p_idx);
+                                       ext4_idx_pblock(path[cur_ppos].p_idx);
                                if (path[cur_ppos+1].p_bh)
                                        brelse(path[cur_ppos+1].p_bh);
                                path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
@@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                        path[leaf_ppos].p_ext = *extent =
                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
                        path[leaf_ppos].p_block =
-                                       ext_pblock(path[leaf_ppos].p_ext);
+                                       ext4_ext_pblock(path[leaf_ppos].p_ext);
                        return 0;
                }
        }
@@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                         */
                        o_end->ee_block = end_ext->ee_block;
                        o_end->ee_len = end_ext->ee_len;
-                       ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+                       ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
                }
 
                o_start->ee_len = start_ext->ee_len;
@@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
                 */
                o_end->ee_block = end_ext->ee_block;
                o_end->ee_len = end_ext->ee_len;
-               ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+               ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 
                /*
                 * Set 0 to the extent block if new_ext was
@@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start,
        /* Insert new entry */
        if (new_ext->ee_len) {
                o_start[i] = *new_ext;
-               ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+               ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
        }
 
        /* Insert end entry */
@@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
        start_ext.ee_len = end_ext.ee_len = 0;
 
        new_ext.ee_block = cpu_to_le32(*from);
-       ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+       ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
        new_ext.ee_len = dext->ee_len;
        new_ext_alen = ext4_ext_get_actual_len(&new_ext);
        new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
@@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
                copy_extent_status(oext, &end_ext);
                end_ext_alen = ext4_ext_get_actual_len(&end_ext);
                ext4_ext_store_pblock(&end_ext,
-                       (ext_pblock(o_end) + oext_alen - end_ext_alen));
+                       (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
                end_ext.ee_block =
                        cpu_to_le32(le32_to_cpu(o_end->ee_block) +
                        oext_alen - end_ext_alen);
@@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
        /* When tmp_dext is too large, pick up the target range. */
        diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
 
-       ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+       ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
        tmp_dext->ee_block =
                        cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
        tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
@@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
                tmp_dext->ee_len = cpu_to_le16(max_count);
 
        orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-       ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+       ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
 
        /* Adjust extent length if donor extent is larger than orig */
        if (ext4_ext_get_actual_len(tmp_dext) >
index bd39885..92203b8 100644 (file)
@@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
        struct buffer_head *bh, *ret = NULL;
        ext4_lblk_t start, block, b;
+       const u8 *name = d_name->name;
        int ra_max = 0;         /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        int ra_ptr = 0;         /* Current index into readahead
@@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
        namelen = d_name->len;
        if (namelen > EXT4_NAME_LEN)
                return NULL;
+       if ((namelen <= 2) && (name[0] == '.') &&
+           (name[1] == '.' || name[1] == '\0')) {
+               /*
+                * "." (name[1] is the terminating NUL) or ".." will only be
+                * in the first block, so search just that block.
+                * NFS may look up ".."; "." should be handled by the VFS
+                */
+               block = start = 0;
+               nblocks = 1;
+               goto restart;
+       }
        if (is_dx(dir)) {
                bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
                /*
@@ -960,55 +971,35 @@ cleanup_and_exit:
 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
                       struct ext4_dir_entry_2 **res_dir, int *err)
 {
-       struct super_block * sb;
+       struct super_block * sb = dir->i_sb;
        struct dx_hash_info     hinfo;
-       u32 hash;
        struct dx_frame frames[2], *frame;
-       struct ext4_dir_entry_2 *de, *top;
        struct buffer_head *bh;
        ext4_lblk_t block;
        int retval;
-       int namelen = d_name->len;
-       const u8 *name = d_name->name;
 
-       sb = dir->i_sb;
-       /* NFS may look up ".." - look at dx_root directory block */
-       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-               if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-                       return NULL;
-       } else {
-               frame = frames;
-               frame->bh = NULL;                       /* for dx_release() */
-               frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-               dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-       }
-       hash = hinfo.hash;
+       if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+               return NULL;
        do {
                block = dx_get_block(frame->at);
-               if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+               if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                        goto errout;
-               de = (struct ext4_dir_entry_2 *) bh->b_data;
-               top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                      EXT4_DIR_REC_LEN(0));
-               for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-                       int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                                 + ((char *) de - bh->b_data);
-
-                       if (!ext4_check_dir_entry(dir, de, bh, off)) {
-                               brelse(bh);
-                               *err = ERR_BAD_DX_DIR;
-                               goto errout;
-                       }
 
-                       if (ext4_match(namelen, name, de)) {
-                               *res_dir = de;
-                               dx_release(frames);
-                               return bh;
-                       }
+               retval = search_dirblock(bh, dir, d_name,
+                                        block << EXT4_BLOCK_SIZE_BITS(sb),
+                                        res_dir);
+               if (retval == 1) {      /* Success! */
+                       dx_release(frames);
+                       return bh;
                }
                brelse(bh);
+               if (retval == -1) {
+                       *err = ERR_BAD_DX_DIR;
+                       goto errout;
+               }
+
                /* Check to see if we should continue to search */
-               retval = ext4_htree_next_block(dir, hash, frame,
+               retval = ext4_htree_next_block(dir, hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext4_warning(sb,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644 (file)
index 0000000..46a7d6a
--- /dev/null
@@ -0,0 +1,430 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+
+/* slab caches for struct ext4_io_page and ext4_io_end_t allocations */
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+
+/*
+ * Create the slab caches used for ext4_io_page and ext4_io_end objects.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created; on
+ * failure no cache is left allocated.
+ */
+int __init ext4_init_pageio(void)
+{
+       io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+       if (io_page_cachep == NULL)
+               return -ENOMEM;
+       io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+       if (io_end_cachep == NULL) {
+               /* undo the first cache so a failed init leaks nothing */
+               kmem_cache_destroy(io_page_cachep);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/* Destroy both slab caches, in the reverse of the order they are created. */
+void ext4_exit_pageio(void)
+{
+       kmem_cache_destroy(io_end_cachep);
+       kmem_cache_destroy(io_page_cachep);
+}
+
+/*
+ * Release an io_end and everything it holds: the reference on io->page (if
+ * set), its share of every attached io page, the inode reference, and
+ * finally the io_end itself. An io page whose p_count drops to zero has its
+ * writeback ended, its page reference dropped and its descriptor freed.
+ */
+void ext4_free_io_end(ext4_io_end_t *io)
+{
+       int idx;
+
+       BUG_ON(!io);
+       if (io->page)
+               put_page(io->page);
+
+       for (idx = 0; idx < io->num_io_pages; idx++) {
+               struct page *pg;
+
+               /* p_count has not reached zero: do not release the page */
+               if (--io->pages[idx]->p_count != 0)
+                       continue;
+
+               pg = io->pages[idx]->p_page;
+               end_page_writeback(pg);
+               put_page(pg);
+               kmem_cache_free(io_page_cachep, io->pages[idx]);
+       }
+       io->num_io_pages = 0;
+
+       iput(io->inode);
+       kmem_cache_free(io_end_cachep, io);
+}
+
+/*
+ * Convert the unwritten extents covered by a completed io_end to written
+ * ones. Does nothing (and returns 0) when the io_end is not on a list or is
+ * not flagged EXT4_IO_END_UNWRITTEN. On success the attached iocb, if any,
+ * is completed and the unwritten flag is cleared; on failure the negative
+ * error from the conversion is returned and the io_end is left untouched.
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+       struct inode *inode = io->inode;
+       loff_t start = io->offset;
+       ssize_t nbytes = io->size;
+       int err;
+
+       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+                  "list->prev 0x%p\n",
+                  io, inode->i_ino, io->list.next, io->list.prev);
+
+       /* nothing to convert unless the io is queued and marked unwritten */
+       if (list_empty(&io->list) || !(io->flag & EXT4_IO_END_UNWRITTEN))
+               return 0;
+
+       err = ext4_convert_unwritten_extents(inode, start, nbytes);
+       if (err < 0) {
+               printk(KERN_EMERG "%s: failed to convert unwritten "
+                       "extents to written extents, error is %d "
+                       "io is still on inode %lu aio dio list\n",
+                      __func__, err, inode->i_ino);
+               return err;
+       }
+
+       if (io->iocb)
+               aio_complete(io->iocb, io->result, 0);
+
+       /* clear the DIO AIO unwritten flag */
+       io->flag &= ~EXT4_IO_END_UNWRITTEN;
+       return err;
+}
+
+/*
+ * Work item run for a completed io_end: under the inode's i_mutex, convert
+ * its unwritten extents to written, then take it off the list it is queued
+ * on and free it. If the conversion fails the io_end is neither unlinked
+ * nor freed.
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+       ext4_io_end_t *io_end = container_of(work, ext4_io_end_t, work);
+       struct inode *inode = io_end->inode;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long irqflags;
+
+       mutex_lock(&inode->i_mutex);
+
+       if (ext4_end_io_nolock(io_end) < 0) {
+               mutex_unlock(&inode->i_mutex);
+               return;
+       }
+
+       spin_lock_irqsave(&ei->i_completed_io_lock, irqflags);
+       if (!list_empty(&io_end->list))
+               list_del_init(&io_end->list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, irqflags);
+
+       mutex_unlock(&inode->i_mutex);
+       ext4_free_io_end(io_end);
+}
+
+/*
+ * Allocate a zeroed io_end for @inode using allocation flags @flags. The
+ * io_end takes its own inode reference via igrab() and has its work item
+ * and list head initialised. Returns NULL if the allocation fails.
+ */
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+       ext4_io_end_t *io_end;
+
+       io_end = kmem_cache_alloc(io_end_cachep, flags);
+       if (!io_end)
+               return NULL;
+
+       memset(io_end, 0, sizeof(*io_end));
+       io_end->inode = igrab(inode);
+       BUG_ON(!io_end->inode);
+       INIT_WORK(&io_end->work, ext4_end_io_work);
+       INIT_LIST_HEAD(&io_end->list);
+
+       return io_end;
+}
+
+/*
+ * Report a buffer I/O error using the same message text as fs/buffer.c,
+ * so that dmesg scrapers looking for that specific message keep working.
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+       char devname[BDEVNAME_SIZE];
+       unsigned long long blocknr = (unsigned long long)bh->b_blocknr;
+
+       printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+                       bdevname(bh->b_bdev, devname), blocknr);
+}
+
+/*
+ * Completion handler for a bio whose bi_private points at an ext4_io_end_t
+ * (presumably installed as the bio's bi_end_io by the submit path -- confirm
+ * against ext4_io_submit).
+ *
+ * @error is cleared when the bio is flagged BIO_UPTODATE. If the superblock
+ * is no longer MS_ACTIVE the io_end is freed and nothing else is done.
+ * Otherwise every page attached to the io_end has the dirty bit cleared on
+ * the buffers inside the io range, writeback is ended on pages whose p_count
+ * drops to zero, and the io_end is appended to the inode's completed-io list
+ * and its work item queued on the dio_unwritten_wq workqueue.
+ */
+static void ext4_end_bio(struct bio *bio, int error)
+{
+       ext4_io_end_t *io_end = bio->bi_private;
+       struct workqueue_struct *wq;
+       struct inode *inode;
+       unsigned long flags;
+       ext4_fsblk_t err_block;
+       int i;
+
+       BUG_ON(!io_end);
+       inode = io_end->inode;
+       bio->bi_private = NULL;
+       bio->bi_end_io = NULL;
+       if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+               error = 0;
+       err_block = bio->bi_sector >> (inode->i_blkbits - 9);
+       bio_put(bio);
+
+       if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
+               pr_err("sb umounted, discard end_io request for inode %lu\n",
+                       io_end->inode->i_ino);
+               ext4_free_io_end(io_end);
+               return;
+       }
+
+       if (error) {
+               io_end->flag |= EXT4_IO_END_ERROR;
+               ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+                            "(offset %llu size %ld starting block %llu)",
+                            inode->i_ino,
+                            (unsigned long long) io_end->offset,
+                            (long) io_end->size,
+                            (unsigned long long) err_block);
+       }
+
+       for (i = 0; i < io_end->num_io_pages; i++) {
+               struct page *page = io_end->pages[i]->p_page;
+               struct buffer_head *bh, *head;
+               int partial_write = 0;
+
+               head = page_buffers(page);
+               if (error)
+                       SetPageError(page);
+               BUG_ON(!head);
+               if (head->b_size == PAGE_CACHE_SIZE)
+                       clear_buffer_dirty(head);
+               else {
+                       loff_t offset;
+                       loff_t io_end_offset = io_end->offset + io_end->size;
+
+                       offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
+                       bh = head;
+                       do {
+                               if ((offset >= io_end->offset) &&
+                                   (offset+bh->b_size <= io_end_offset)) {
+                                       if (error)
+                                               buffer_io_error(bh);
+
+                                       clear_buffer_dirty(bh);
+                               }
+                               if (buffer_delay(bh))
+                                       partial_write = 1;
+                               else if (!buffer_mapped(bh))
+                                       clear_buffer_dirty(bh);
+                               else if (buffer_dirty(bh))
+                                       partial_write = 1;
+                               offset += bh->b_size;
+                               bh = bh->b_this_page;
+                       } while (bh != head);
+               }
+
+               if (--io_end->pages[i]->p_count == 0) {
+                       struct page *page = io_end->pages[i]->p_page;
+
+                       end_page_writeback(page);
+                       put_page(page);
+                       kmem_cache_free(io_page_cachep, io_end->pages[i]);
+               }
+
+               /*
+                * If this is a partial write which happened to make
+                * all buffers uptodate then we can optimize away a
+                * bogus readpage() for the next read(). Here we
+                * 'discover' whether the page went uptodate as a
+                * result of this (potentially partial) write.
+                *
+                * NOTE(review): this runs after the block above may
+                * already have ended writeback and dropped the page
+                * reference; confirm the page is still safe to touch
+                * here, or set the flag before the release.
+                */
+               if (!partial_write)
+                       SetPageUptodate(page);
+       }
+
+       io_end->num_io_pages = 0;
+
+       /* Add the io_end to per-inode completed io list*/
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
+}
+
+/*
+ * Hand the bio that has been built up in @io (if there is one) to the
+ * block layer, then reset @io so the next buffer starts from scratch.
+ */
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+       struct bio *pending = io->io_bio;
+
+       if (pending != NULL) {
+               /*
+                * Take our own reference around submit_bio() so the bio's
+                * flags can still be inspected after the call returns.
+                */
+               bio_get(pending);
+               submit_bio(io->io_op, pending);
+               BUG_ON(bio_flagged(pending, BIO_EOPNOTSUPP));
+               bio_put(pending);
+       }
+       io->io_end = NULL;
+       io->io_op = 0;
+       io->io_bio = NULL;
+}
+
+/*
+ * Start a new bio, and the io_end that tracks it, for @io with @bh as the
+ * first buffer.
+ *
+ * Returns 0 on success or -ENOMEM if the io_end cannot be allocated.
+ */
+static int io_submit_init(struct ext4_io_submit *io,
+                         struct inode *inode,
+                         struct writeback_control *wbc,
+                         struct buffer_head *bh)
+{
+       ext4_io_end_t *io_end;
+       struct page *page = bh->b_page;
+       int nvecs = bio_get_nr_vecs(bh->b_bdev);
+       struct bio *bio;
+
+       io_end = ext4_init_io_end(inode, GFP_NOFS);
+       if (!io_end)
+               return -ENOMEM;
+       /* Keep halving the requested vector count until a bio is obtained */
+       do {
+               bio = bio_alloc(GFP_NOIO, nvecs);
+               nvecs >>= 1;
+       } while (bio == NULL);
+
+       /* Convert the block number into 512-byte sectors */
+       bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_bdev = bh->b_bdev;
+       bio->bi_private = io->io_end = io_end;
+       bio->bi_end_io = ext4_end_bio;
+
+       io_end->inode = inode;
+       /*
+        * Widen page->index before shifting: done in unsigned long, the
+        * shift would truncate the byte offset on 32-bit for file
+        * positions of 4 GiB and beyond.
+        */
+       io_end->offset = ((loff_t) page->index << PAGE_CACHE_SHIFT) +
+                        bh_offset(bh);
+
+       io->io_bio = bio;
+       io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
+                       WRITE_SYNC_PLUG : WRITE);
+       io->io_next_block = bh->b_blocknr;
+       return 0;
+}
+
+/*
+ * Add buffer @bh (belonging to the page tracked by @io_page) to the bio
+ * being assembled in @io, submitting the current bio and starting a new
+ * one whenever @bh cannot be merged into it.
+ *
+ * Returns 0 on success, or the error from io_submit_init() when a new
+ * bio/io_end had to be set up and that failed.
+ */
+static int io_submit_add_bh(struct ext4_io_submit *io,
+                           struct ext4_io_page *io_page,
+                           struct inode *inode,
+                           struct writeback_control *wbc,
+                           struct buffer_head *bh)
+{
+       ext4_io_end_t *io_end;
+       int ret;
+
+       if (buffer_new(bh)) {
+               clear_buffer_new(bh);
+               unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+       }
+
+       /*
+        * Unmapped or delayed buffers are not written here; they also end
+        * the current run of blocks, so flush any bio in progress.
+        */
+       if (!buffer_mapped(bh) || buffer_delay(bh)) {
+               if (!buffer_mapped(bh))
+                       clear_buffer_dirty(bh);
+               if (io->io_bio)
+                       ext4_io_submit(io);
+               return 0;
+       }
+
+       /* A block that does not follow the previous one needs a new bio */
+       if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+               ext4_io_submit(io);
+       }
+       if (io->io_bio == NULL) {
+               ret = io_submit_init(io, inode, wbc, bh);
+               if (ret)
+                       return ret;
+       }
+       io_end = io->io_end;
+       /* The io_end page array is full and this buffer is on a new page */
+       if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+           (io_end->pages[io_end->num_io_pages-1] != io_page))
+               goto submit_and_retry;
+       ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+       if (ret != bh->b_size)
+               goto submit_and_retry;
+       /*
+        * Only account for the buffer once the bio has accepted it;
+        * otherwise the io_end submitted via submit_and_retry would be
+        * charged for a buffer it does not carry.
+        */
+       if (buffer_uninit(bh))
+               io_end->flag |= EXT4_IO_END_UNWRITTEN;
+       io_end->size += bh->b_size;
+       io->io_next_block++;
+       if ((io_end->num_io_pages == 0) ||
+           (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+               io_end->pages[io_end->num_io_pages++] = io_page;
+               io_page->p_count++;
+       }
+       return 0;
+}
+
+/*
+ * Queue the buffers of @page that start within its first @len bytes for
+ * writeback through @io.  Buffers starting at or beyond @len are simply
+ * marked clean and uptodate.  The page is unlocked before returning.
+ *
+ * Returns 0 on success or -ENOMEM; on failure the page is redirtied so
+ * that a later writeback pass can retry.
+ */
+int ext4_bio_write_page(struct ext4_io_submit *io,
+                       struct page *page,
+                       int len,
+                       struct writeback_control *wbc)
+{
+       struct inode *inode = page->mapping->host;
+       unsigned block_start, block_end, blocksize;
+       struct ext4_io_page *io_page;
+       struct buffer_head *bh, *head;
+       int ret = 0;
+
+       blocksize = 1 << inode->i_blkbits;
+
+       BUG_ON(PageWriteback(page));
+
+       io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+       if (!io_page) {
+               set_page_dirty(page);
+               unlock_page(page);
+               return -ENOMEM;
+       }
+       io_page->p_page = page;
+       io_page->p_count = 0;
+       get_page(page);
+       /*
+        * Mark the page under writeback only after the allocation has
+        * succeeded; the failure return above has nothing that would
+        * clear the writeback bit again.
+        */
+       set_page_writeback(page);
+       ClearPageError(page);
+
+       for (bh = head = page_buffers(page), block_start = 0;
+            bh != head || !block_start;
+            block_start = block_end, bh = bh->b_this_page) {
+               block_end = block_start + blocksize;
+               if (block_start >= len) {
+                       /* Beyond the range to write: nothing to submit */
+                       clear_buffer_dirty(bh);
+                       set_buffer_uptodate(bh);
+                       continue;
+               }
+               ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+               if (ret) {
+                       /*
+                        * We only get here on ENOMEM.  Not much else
+                        * we can do but mark the page as dirty, and
+                        * better luck next time.
+                        */
+                       set_page_dirty(page);
+                       break;
+               }
+       }
+       unlock_page(page);
+       /*
+        * If the page was truncated before we could do the writeback,
+        * or we had a memory allocation error while trying to write
+        * the first buffer head, we won't have submitted any pages for
+        * I/O.  In that case we need to make sure we've cleared the
+        * PageWriteback bit from the page to prevent the system from
+        * wedging later on.
+        */
+       if (io_page->p_count == 0) {
+               /* Same order as the completion path: end writeback, then put */
+               end_page_writeback(page);
+               put_page(page);
+               kmem_cache_free(io_page_cachep, io_page);
+       }
+       return ret;
+}
index ca5c8aa..dc96392 100644 (file)
@@ -226,23 +226,13 @@ static int setup_new_group_blocks(struct super_block *sb,
        }
 
        /* Zero out all of the reserved backup group descriptor table blocks */
-       for (i = 0, bit = gdblocks + 1, block = start + bit;
-            i < reserved_gdb; i++, block++, bit++) {
-               struct buffer_head *gdb;
-
-               ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
-
-               if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                       goto exit_bh;
+       ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+                       block, sbi->s_itb_per_group);
+       err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
+                              GFP_NOFS);
+       if (err)
+               goto exit_bh;
 
-               if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                       err = PTR_ERR(gdb);
-                       goto exit_bh;
-               }
-               ext4_handle_dirty_metadata(handle, NULL, gdb);
-               ext4_set_bit(bit, bh->b_data);
-               brelse(gdb);
-       }
        ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                   input->block_bitmap - start);
        ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@ -251,28 +241,18 @@ static int setup_new_group_blocks(struct super_block *sb,
        ext4_set_bit(input->inode_bitmap - start, bh->b_data);
 
        /* Zero out all of the inode table blocks */
-       for (i = 0, block = input->inode_table, bit = block - start;
-            i < sbi->s_itb_per_group; i++, bit++, block++) {
-               struct buffer_head *it;
-
-               ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
-
-               if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                       goto exit_bh;
-
-               if (IS_ERR(it = bclean(handle, sb, block))) {
-                       err = PTR_ERR(it);
-                       goto exit_bh;
-               }
-               ext4_handle_dirty_metadata(handle, NULL, it);
-               brelse(it);
-               ext4_set_bit(bit, bh->b_data);
-       }
+       block = input->inode_table;
+       ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+                       block, sbi->s_itb_per_group);
+       err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
+       if (err)
+               goto exit_bh;
 
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
 
-       mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+       ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+                            bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
@@ -283,8 +263,8 @@ static int setup_new_group_blocks(struct super_block *sb,
                goto exit_journal;
        }
 
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-                       bh->b_data);
+       ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+                            bh->b_data);
        ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
        brelse(bh);
index 8ecc1e5..0348ce0 100644 (file)
@@ -40,6 +40,9 @@
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
 
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
 
-struct proc_dir_entry *ext4_proc_root;
+static struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
+struct ext4_lazy_init *ext4_li_info;
+struct mutex ext4_li_mtx;
+struct ext4_features *ext4_feat;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@ -69,6 +75,8 @@ static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
 static int ext4_get_sb(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data, struct vfsmount *mnt);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
@@ -701,6 +709,7 @@ static void ext4_put_super(struct super_block *sb)
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
 
+       ext4_unregister_li_request(sb);
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
        flush_workqueue(sbi->dio_unwritten_wq);
@@ -717,6 +726,7 @@ static void ext4_put_super(struct super_block *sb)
                        ext4_abort(sb, "Couldn't clean up the journal");
        }
 
+       del_timer(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);
@@ -1042,6 +1052,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
                seq_puts(seq, ",block_validity");
 
+       if (!test_opt(sb, INIT_INODE_TABLE))
+               seq_puts(seq, ",noinit_inode_table");
+       else if (sbi->s_li_wait_mult)
+               seq_printf(seq, ",init_inode_table=%u",
+                          (unsigned) sbi->s_li_wait_mult);
+
        ext4_show_quota_options(seq, sb);
 
        return 0;
@@ -1170,6 +1186,7 @@ static const struct super_operations ext4_sops = {
        .quota_write    = ext4_quota_write,
 #endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
+       .trim_fs        = ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
@@ -1216,6 +1233,7 @@ enum {
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
+       Opt_init_inode_table, Opt_noinit_inode_table,
 };
 
 static const match_table_t tokens = {
@@ -1286,6 +1304,9 @@ static const match_table_t tokens = {
        {Opt_dioread_lock, "dioread_lock"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
+       {Opt_init_inode_table, "init_itable=%u"},
+       {Opt_init_inode_table, "init_itable"},
+       {Opt_noinit_inode_table, "noinit_itable"},
        {Opt_err, NULL},
 };
 
@@ -1756,6 +1777,20 @@ set_qf_format:
                case Opt_dioread_lock:
                        clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
                        break;
+               case Opt_init_inode_table:
+                       set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                       if (args[0].from) {
+                               if (match_int(&args[0], &option))
+                                       return 0;
+                       } else
+                               option = EXT4_DEF_LI_WAIT_MULT;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_li_wait_mult = option;
+                       break;
+               case Opt_noinit_inode_table:
+                       clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                       break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -1939,7 +1974,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
 }
 
 /* Called at mount-time, super-block is locked */
-static int ext4_check_descriptors(struct super_block *sb)
+static int ext4_check_descriptors(struct super_block *sb,
+                                 ext4_group_t *first_not_zeroed)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1948,7 +1984,7 @@ static int ext4_check_descriptors(struct super_block *sb)
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
-       ext4_group_t i;
+       ext4_group_t i, grp = sbi->s_groups_count;
 
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                flexbg_flag = 1;
@@ -1964,6 +2000,10 @@ static int ext4_check_descriptors(struct super_block *sb)
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);
 
+               if ((grp == sbi->s_groups_count) &&
+                  !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       grp = i;
+
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2001,6 +2041,8 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
+       if (NULL != first_not_zeroed)
+               *first_not_zeroed = grp;
 
        ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
        sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2373,6 +2415,7 @@ static struct ext4_attr ext4_attr_##_name = {                     \
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 
+/* Mode-0444 attribute declared with neither a show nor a store callback */
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)      \
@@ -2409,6 +2452,16 @@ static struct attribute *ext4_attrs[] = {
        NULL,
 };
 
+/*
+ * Features this copy of ext4 supports.  Each one is declared through
+ * EXT4_INFO_ATTR and listed in the NULL-terminated array below, which
+ * ext4_feat_ktype uses as its default_attrs.
+ */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+
+static struct attribute *ext4_feat_attrs[] = {
+       ATTR_LIST(lazy_itable_init),
+       ATTR_LIST(batched_discard),
+       NULL,
+};
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
                              struct attribute *attr, char *buf)
 {
@@ -2437,7 +2490,6 @@ static void ext4_sb_release(struct kobject *kobj)
        complete(&sbi->s_kobj_unregister);
 }
 
-
 static const struct sysfs_ops ext4_attr_ops = {
        .show   = ext4_attr_show,
        .store  = ext4_attr_store,
@@ -2449,6 +2501,17 @@ static struct kobj_type ext4_ktype = {
        .release        = ext4_sb_release,
 };
 
+/*
+ * Release callback installed in ext4_feat_ktype: signals the
+ * f_kobj_unregister completion of the global ext4_feat object.
+ */
+static void ext4_feat_release(struct kobject *kobj)
+{
+       complete(&ext4_feat->f_kobj_unregister);
+}
+
+/*
+ * kobj_type for the feature kobject: publishes ext4_feat_attrs via the
+ * same sysfs ops as ext4_ktype and calls ext4_feat_release on release.
+ */
+static struct kobj_type ext4_feat_ktype = {
+       .default_attrs  = ext4_feat_attrs,
+       .sysfs_ops      = &ext4_attr_ops,
+       .release        = ext4_feat_release,
+};
+
 /*
  * Check whether this filesystem can be mounted based on
  * the features present and the RDONLY/RDWR mount requested.
@@ -2539,6 +2602,372 @@ static void print_daily_error_info(unsigned long arg)
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
 
+/* Timer callback: @data carries the task_struct pointer of the task to wake */
+static void ext4_lazyinode_timeout(unsigned long data)
+{
+       wake_up_process((struct task_struct *)data);
+}
+
+/*
+ * Starting at elr->lr_next_group, find the next group whose descriptor
+ * lacks EXT4_BG_INODE_ZEROED and run ext4_init_inode_table() on it, then
+ * schedule the next run of the request.
+ *
+ * Returns the result of ext4_init_inode_table() for that group, or 1 when
+ * the search reaches the last group or a group descriptor is unavailable.
+ */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+       struct super_block *sb = elr->lr_super;
+       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       ext4_group_t group = elr->lr_next_group;
+       struct ext4_group_desc *gdp;
+       unsigned long elapsed;
+       int ret;
+
+       while (group < ngroups) {
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               if (gdp == NULL)
+                       return 1;
+
+               if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       break;
+               group++;
+       }
+
+       if (group == ngroups)
+               return 1;
+
+       elapsed = jiffies;
+       ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1);
+       if (elr->lr_timeout == 0) {
+               /*
+                * First run: derive the delay between runs from how long
+                * this one took, scaled by s_li_wait_mult (20 if unset).
+                */
+               elapsed = jiffies - elapsed;
+               if (elr->lr_sbi->s_li_wait_mult)
+                       elapsed *= elr->lr_sbi->s_li_wait_mult;
+               else
+                       elapsed *= 20;
+               elr->lr_timeout = elapsed;
+       }
+       elr->lr_next_sched = jiffies + elr->lr_timeout;
+       elr->lr_next_group = group + 1;
+
+       return ret;
+}
+
+/*
+ * Unlink @elr from the request list, clear the owning superblock's
+ * s_li_request pointer and free the request structure.  A NULL @elr is
+ * ignored.  Should be called with li_list_mtx held.
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+       if (elr == NULL)
+               return;
+
+       elr->lr_sbi->s_li_request = NULL;
+       list_del(&elr->lr_request);
+       kfree(elr);
+}
+
+/*
+ * Drop the lazy-init request (if any) registered for @sb.
+ *
+ * ext4_li_mtx is held across the whole operation because the lazyinit
+ * thread frees ext4_li_info under that mutex when it exits, and the
+ * request pointer is only sampled once li_list_mtx is held because the
+ * thread removes and frees finished requests under that lock.
+ */
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+       mutex_lock(&ext4_li_mtx);
+       if (!ext4_li_info) {
+               mutex_unlock(&ext4_li_mtx);
+               return;
+       }
+
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+       mutex_unlock(&ext4_li_mtx);
+}
+
+/*
+ * This is the function where the ext4lazyinit thread lives. It walks
+ * through the request list searching for the next scheduled filesystem.
+ * When such a fs is found, it runs the lazy initialization request
+ * (ext4_run_li_request) and keeps track of the time spent in this
+ * function. Based on that time we compute the next schedule time of
+ * the request. When walking through the list is complete, compute
+ * the next waking time and put itself to sleep.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+       struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+       struct list_head *pos, *n;
+       struct ext4_li_request *elr;
+       unsigned long next_wakeup;
+       DEFINE_WAIT(wait);
+       int ret;
+
+       BUG_ON(NULL == eli);
+
+       eli->li_timer.data = (unsigned long)current;
+       eli->li_timer.function = ext4_lazyinode_timeout;
+
+       eli->li_task = current;
+       wake_up(&eli->li_wait_task);
+
+cont_thread:
+       while (true) {
+               next_wakeup = MAX_JIFFY_OFFSET;
+
+               mutex_lock(&eli->li_list_mtx);
+               if (list_empty(&eli->li_request_list)) {
+                       mutex_unlock(&eli->li_list_mtx);
+                       goto exit_thread;
+               }
+
+               list_for_each_safe(pos, n, &eli->li_request_list) {
+                       elr = list_entry(pos, struct ext4_li_request,
+                                        lr_request);
+
+                       /*
+                        * A request that is not due yet must be kept, so
+                        * start from "no error" for every list entry.
+                        */
+                       ret = 0;
+                       if (time_after_eq(jiffies, elr->lr_next_sched))
+                               ret = ext4_run_li_request(elr);
+
+                       if (ret) {
+                               /* Finished or failed: drop the request */
+                               ext4_remove_li_request(elr);
+                               continue;
+                       }
+
+                       if (time_before(elr->lr_next_sched, next_wakeup))
+                               next_wakeup = elr->lr_next_sched;
+               }
+               mutex_unlock(&eli->li_list_mtx);
+
+               if (freezing(current))
+                       refrigerator();
+
+               if (time_after_eq(jiffies, next_wakeup)) {
+                       cond_resched();
+                       continue;
+               }
+
+               eli->li_timer.expires = next_wakeup;
+               add_timer(&eli->li_timer);
+               prepare_to_wait(&eli->li_wait_daemon, &wait,
+                               TASK_INTERRUPTIBLE);
+               if (time_before(jiffies, next_wakeup))
+                       schedule();
+               finish_wait(&eli->li_wait_daemon, &wait);
+       }
+
+exit_thread:
+       /*
+        * It looks like the request list is empty, but we need
+        * to check it under the li_list_mtx lock, to prevent any
+        * additions into it, and of course we should lock ext4_li_mtx
+        * to atomically free the list and ext4_li_info, because at
+        * this point another ext4 filesystem could be registering
+        * a new one.
+        */
+       mutex_lock(&ext4_li_mtx);
+       mutex_lock(&eli->li_list_mtx);
+       if (!list_empty(&eli->li_request_list)) {
+               mutex_unlock(&eli->li_list_mtx);
+               mutex_unlock(&ext4_li_mtx);
+               goto cont_thread;
+       }
+       mutex_unlock(&eli->li_list_mtx);
+       del_timer_sync(&ext4_li_info->li_timer);
+       eli->li_task = NULL;
+       wake_up(&eli->li_wait_task);
+
+       kfree(ext4_li_info);
+       ext4_li_info = NULL;
+       mutex_unlock(&ext4_li_mtx);
+
+       return 0;
+}
+
+/*
+ * Remove and free every request on ext4_li_info's request list.
+ * Takes li_list_mtx for the duration of the walk.
+ */
+static void ext4_clear_request_list(void)
+{
+       struct list_head *pos, *n;
+       struct ext4_li_request *elr;
+
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       /*
+        * No separate empty-list check: the walk below is a no-op for an
+        * empty list, and every path must reach the unlock.
+        */
+       list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+               elr = list_entry(pos, struct ext4_li_request,
+                                lr_request);
+               ext4_remove_li_request(elr);
+       }
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+/*
+ * Start the "ext4lazyinit" kernel thread for ext4_li_info and wait until
+ * it has published itself in li_task.
+ *
+ * Returns 0 on success.  If the thread cannot be created, all queued
+ * requests and ext4_li_info itself are torn down and the error from
+ * kthread_run() is returned.
+ */
+static int ext4_run_lazyinit_thread(void)
+{
+       struct task_struct *task;
+       int err;
+
+       task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+       if (!IS_ERR(task)) {
+               ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+
+               wait_event(ext4_li_info->li_wait_task,
+                          ext4_li_info->li_task != NULL);
+               return 0;
+       }
+
+       err = PTR_ERR(task);
+       ext4_clear_request_list();
+       del_timer_sync(&ext4_li_info->li_timer);
+       kfree(ext4_li_info);
+       ext4_li_info = NULL;
+       printk(KERN_CRIT "EXT4: error %d creating inode table "
+                        "initialization thread\n",
+                        err);
+       return err;
+}
+
+/*
+ * Decide whether running the itable init thread makes sense.
+ * Returns the number of the first group whose descriptor is not flagged
+ * EXT4_BG_INODE_ZEROED; if every group is flagged, the total number of
+ * groups is returned instead.  Groups whose descriptor cannot be
+ * obtained are skipped.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       ext4_group_t group = 0;
+
+       while (group < ngroups) {
+               struct ext4_group_desc *desc;
+
+               desc = ext4_get_group_desc(sb, group, NULL);
+               if (desc &&
+                   !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       break;
+               group++;
+       }
+
+       return group;
+}
+
+/*
+ * Allocate and initialise the global lazy-init bookkeeping structure and
+ * publish it in ext4_li_info.  Returns 0 on success, -ENOMEM otherwise.
+ */
+static int ext4_li_info_new(void)
+{
+       struct ext4_lazy_init *info;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (info == NULL)
+               return -ENOMEM;
+
+       info->li_task = NULL;
+       info->li_state |= EXT4_LAZYINIT_QUIT;
+
+       mutex_init(&info->li_list_mtx);
+       INIT_LIST_HEAD(&info->li_request_list);
+
+       init_waitqueue_head(&info->li_wait_daemon);
+       init_waitqueue_head(&info->li_wait_task);
+       init_timer(&info->li_timer);
+
+       ext4_li_info = info;
+
+       return 0;
+}
+
+/*
+ * Allocate a lazy-init request for @sb that will begin at group @start.
+ * Returns the new request, or NULL if the allocation fails.
+ */
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+                                           ext4_group_t start)
+{
+       struct ext4_li_request *req;
+       unsigned long rnd;
+
+       req = kzalloc(sizeof(*req), GFP_KERNEL);
+       if (req == NULL)
+               return NULL;
+
+       req->lr_super = sb;
+       req->lr_sbi = EXT4_SB(sb);
+       req->lr_next_group = start;
+
+       /*
+        * The first run is delayed by a random amount below
+        * EXT4_DEF_LI_MAX_START_DELAY seconds so that requests from
+        * different filesystems are spread out in time.
+        */
+       get_random_bytes(&rnd, sizeof(rnd));
+       req->lr_next_sched = jiffies +
+                            rnd % (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+
+       return req;
+}
+
+/*
+ * Register a lazy inode table initialisation request for @sb, creating
+ * ext4_li_info and starting the lazyinit thread if they do not exist yet.
+ *
+ * Nothing is registered (and 0 is returned) when a request already
+ * exists, when @first_not_zeroed equals the group count, when the
+ * filesystem is read-only, or when INIT_INODE_TABLE is not set.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ext4_register_li_request(struct super_block *sb,
+                                   ext4_group_t first_not_zeroed)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_li_request *elr;
+       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       int ret = 0;
+
+       if (sbi->s_li_request != NULL)
+               return 0;
+
+       if (first_not_zeroed == ngroups ||
+           (sb->s_flags & MS_RDONLY) ||
+           !test_opt(sb, INIT_INODE_TABLE))
+               return 0;
+
+       elr = ext4_li_request_new(sb, first_not_zeroed);
+       if (!elr)
+               return -ENOMEM;
+
+       mutex_lock(&ext4_li_mtx);
+
+       if (NULL == ext4_li_info) {
+               ret = ext4_li_info_new();
+               if (ret)
+                       goto out;
+       }
+
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+
+       sbi->s_li_request = elr;
+       /*
+        * The request is now owned by the list.  If starting the thread
+        * fails, ext4_run_lazyinit_thread() clears the list and frees
+        * the request itself, so it must not be freed again below.
+        */
+       elr = NULL;
+
+       if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+               ret = ext4_run_lazyinit_thread();
+               if (ret)
+                       goto out;
+       }
+out:
+       mutex_unlock(&ext4_li_mtx);
+       if (ret)
+               kfree(elr);
+       return ret;
+}
+
+/*
+ * Drop all pending lazy-init requests and wait for the lazyinit thread
+ * to go away.  No locking is done here since this is called on module
+ * unload.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+       /* The thread already exited and cleaned up: nothing left to do */
+       if (ext4_li_info == NULL)
+               return;
+
+       ext4_clear_request_list();
+
+       /*
+        * NOTE(review): the thread frees ext4_li_info and resets the
+        * pointer when it exits, so re-reading ext4_li_info here after
+        * the wait looks unsafe - confirm.
+        */
+       while (ext4_li_info->li_task != NULL) {
+               wake_up(&ext4_li_info->li_wait_daemon);
+               wait_event(ext4_li_info->li_wait_task,
+                          ext4_li_info->li_task == NULL);
+       }
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
@@ -2564,6 +2993,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        __u64 blocks_count;
        int err;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+       ext4_group_t first_not_zeroed;
 
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
@@ -2624,6 +3054,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+       set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sbi->s_mount_opt, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2901,7 +3332,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        goto failed_mount2;
                }
        }
-       if (!ext4_check_descriptors(sb)) {
+       if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                goto failed_mount2;
        }
@@ -3122,6 +3553,10 @@ no_journal:
                goto failed_mount4;
        }
 
+       err = ext4_register_li_request(sb, first_not_zeroed);
+       if (err)
+               goto failed_mount4;
+
        sbi->s_kobj.kset = ext4_kset;
        init_completion(&sbi->s_kobj_unregister);
        err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3461,7 +3896,7 @@ static int ext4_load_journal(struct super_block *sb,
        EXT4_SB(sb)->s_journal = journal;
        ext4_clear_journal_err(sb, es);
 
-       if (journal_devnum &&
+       if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
 
@@ -3514,9 +3949,12 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-       ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+       if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+               ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeblocks_counter));
-       es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+       if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+               es->s_free_inodes_count =
+                       cpu_to_le32(percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeinodes_counter));
        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
@@ -3835,6 +4273,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        enable_quota = 1;
                }
        }
+
+       /*
+        * Reinitialize lazy itable initialization thread based on
+        * current settings
+        */
+       if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+               ext4_unregister_li_request(sb);
+       else {
+               ext4_group_t first_not_zeroed;
+               first_not_zeroed = ext4_has_uninit_itable(sb);
+               ext4_register_li_request(sb, first_not_zeroed);
+       }
+
        ext4_setup_system_zone(sb);
        if (sbi->s_journal == NULL)
                ext4_commit_super(sb, 1);
@@ -4276,23 +4727,53 @@ static struct file_system_type ext4_fs_type = {
        .fs_flags       = FS_REQUIRES_DEV,
 };
 
-static int __init init_ext4_fs(void)
+/* Create the "features" kobject in ext4_kset and store it in ext4_feat. */
+int __init ext4_init_feat_adverts(void)
+{
+       struct ext4_features *feat = kzalloc(sizeof(*feat), GFP_KERNEL);
+       int err;
+
+       if (!feat)
+               return -ENOMEM;
+
+       feat->f_kobj.kset = ext4_kset;
+       init_completion(&feat->f_kobj_unregister);
+
+       err = kobject_init_and_add(&feat->f_kobj, &ext4_feat_ktype, NULL,
+                                  "features");
+       if (err) {
+               /* Registration failed: ext4_feat is left untouched. */
+               kfree(feat);
+               return err;
+       }
+
+       ext4_feat = feat;
+       return 0;
+}
+
+static int __init ext4_init_fs(void)
 {
        int err;
 
        ext4_check_flag_values();
-       err = init_ext4_system_zone();
+       err = ext4_init_pageio();
        if (err)
                return err;
+       err = ext4_init_system_zone();
+       if (err)
+               goto out5;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
                goto out4;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-       err = init_ext4_mballoc();
+
+       err = ext4_init_feat_adverts();
+
+       err = ext4_init_mballoc();
        if (err)
                goto out3;
 
-       err = init_ext4_xattr();
+       err = ext4_init_xattr();
        if (err)
                goto out2;
        err = init_inodecache();
@@ -4303,38 +4784,46 @@ static int __init init_ext4_fs(void)
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
+
+       ext4_li_info = NULL;
+       mutex_init(&ext4_li_mtx);
        return 0;
 out:
        unregister_as_ext2();
        unregister_as_ext3();
        destroy_inodecache();
 out1:
-       exit_ext4_xattr();
+       ext4_exit_xattr();
 out2:
-       exit_ext4_mballoc();
+       ext4_exit_mballoc();
 out3:
+       kfree(ext4_feat);
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
 out4:
-       exit_ext4_system_zone();
+       ext4_exit_system_zone();
+out5:
+       ext4_exit_pageio();
        return err;
 }
 
-static void __exit exit_ext4_fs(void)
+static void __exit ext4_exit_fs(void)  /* registered via module_exit() below */
 {
+       ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
-       exit_ext4_xattr();
-       exit_ext4_mballoc();
+       ext4_exit_xattr();
+       ext4_exit_mballoc();    /* NOTE(review): ext4_feat set by ext4_init_feat_adverts() is not released in this function - confirm */
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
-       exit_ext4_system_zone();
+       ext4_exit_system_zone();
+       ext4_exit_pageio();     /* counterpart of ext4_init_pageio(), the first step of ext4_init_fs() */
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
 MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
-module_init(init_ext4_fs)
-module_exit(exit_ext4_fs)
+module_init(ext4_init_fs)
+module_exit(ext4_exit_fs)
index 3a8cd8d..fa4b899 100644 (file)
@@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 #undef BLOCK_HASH_SHIFT
 
 int __init
-init_ext4_xattr(void)
+ext4_init_xattr(void)
 {
        ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
        if (!ext4_xattr_cache)
@@ -1597,7 +1597,7 @@ init_ext4_xattr(void)
 }
 
 void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)
 {
        if (ext4_xattr_cache)
                mb_cache_destroy(ext4_xattr_cache);
index 518e96e..281dd83 100644 (file)
@@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
 
-extern int init_ext4_xattr(void);
-extern void exit_ext4_xattr(void);
+extern int __init ext4_init_xattr(void);
+extern void ext4_exit_xattr(void);
 
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
@@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb)
 {
 }
 
-static inline int
-init_ext4_xattr(void)
+static __init inline int
+ext4_init_xattr(void)  /* inline stub: always reports success */
 {
        return 0;
 }
 
 static inline void
-exit_ext4_xattr(void)
+ext4_exit_xattr(void)  /* inline stub with an empty body */
 {
 }
 
index f855ea4..e92fdbb 100644 (file)
@@ -530,6 +530,41 @@ static int ioctl_fsthaw(struct file *filp)
        return thaw_super(sb);
 }
 
+/*
+ * FITRIM: pass the range from @argp (or 0..ULLONG_MAX if @argp is NULL)
+ * to the filesystem's trim_fs() and copy the resulting range back.
+ */
+static int ioctl_fstrim(struct file *filp, void __user *argp)
+{
+       struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+       struct fstrim_range range;
+       int err;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       /* The filesystem must implement the trim operation. */
+       if (!sb->s_op->trim_fs)
+               return -EOPNOTSUPP;
+       /* Only filesystems backed by a block device can be trimmed. */
+       if (!sb->s_bdev)
+               return -EINVAL;
+
+       if (!argp) {
+               range.start = 0;
+               range.len = ULLONG_MAX;
+               range.minlen = 0;
+       } else if (copy_from_user(&range, argp, sizeof(range))) {
+               return -EFAULT;
+       }
+
+       err = sb->s_op->trim_fs(sb, &range);
+       if (err < 0)
+               return err;
+       if (argp && copy_to_user(argp, &range, sizeof(range)))
+               return -EFAULT;
+       return 0;
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -580,6 +615,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
 
+       case FITRIM:
+               error = ioctl_fstrim(filp, argp);
+               break;
+
        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, arg);
 
index 6571a05..6a79fd0 100644 (file)
@@ -299,6 +299,16 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                transaction->t_chp_stats.cs_forced_to_close++;
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
+               if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+                       /*
+                        * The journal thread is dead; so starting and
+                        * waiting for a commit to finish will cause
+                        * us to wait for a _very_ long time.
+                        */
+                       printk(KERN_ERR "JBD2: %s: "
+                              "Waiting for Godot: block %llu\n",
+                              journal->j_devname,
+                              (unsigned long long) bh->b_blocknr);
                jbd2_log_start_commit(journal, tid);
                jbd2_log_wait_commit(journal, tid);
                ret = 1;
index bc6be8b..f3ad159 100644 (file)
@@ -26,7 +26,9 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/bitops.h>
 #include <trace/events/jbd2.h>
+#include <asm/system.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -201,7 +203,7 @@ static int journal_submit_data_buffers(journal_t *journal,
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
-               jinode->i_flags |= JI_COMMIT_RUNNING;
+               set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                /*
                 * submit the inode data buffers. We use writepage
@@ -216,7 +218,8 @@ static int journal_submit_data_buffers(journal_t *journal,
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                commit_transaction->t_flushed_data_blocks = 1;
-               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+               smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
@@ -237,7 +240,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-               jinode->i_flags |= JI_COMMIT_RUNNING;
+               set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                if (err) {
@@ -253,7 +256,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
                                ret = err;
                }
                spin_lock(&journal->j_list_lock);
-               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+               smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
 
index 168d189..538417c 100644 (file)
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
 #include <linux/backing-dev.h>
+#include <linux/bitops.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/system.h>
 
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
@@ -2210,7 +2212,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
 restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
-       if (jinode->i_flags & JI_COMMIT_RUNNING) {
+       if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
index f3479d6..6bf0a24 100644 (file)
@@ -156,6 +156,7 @@ alloc_transaction:
         */
 repeat:
        read_lock(&journal->j_state_lock);
+       BUG_ON(journal->j_flags & JBD2_UNMOUNT);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
index 646b462..5027a59 100644 (file)
@@ -891,6 +891,14 @@ static inline int sb_issue_discard(struct super_block *sb, sector_t block,
                                    nr_blocks << (sb->s_blocksize_bits - 9),
                                    gfp_mask, flags);
 }
+static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
+               sector_t nr_blocks, gfp_t gfp_mask)
+{
+       const int shift = sb->s_blocksize_bits - 9; /* presumably fs block -> 512-byte unit */
+
+       return blkdev_issue_zeroout(sb->s_bdev, block << shift,
+                                   nr_blocks << shift, gfp_mask);
+}
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 
index b2a6009..6ed7ace 100644 (file)
 #define SEEK_END       2       /* seek relative to end of file */
 #define SEEK_MAX       SEEK_END
 
+struct fstrim_range {  /* argument type of the FITRIM ioctl */
+       uint64_t start;         /* ioctl_fstrim() uses 0 when called without an argument */
+       uint64_t len;           /* ioctl_fstrim() uses ULLONG_MAX when called without an argument */
+       uint64_t minlen;        /* ioctl_fstrim() uses 0 when called without an argument; units not visible here */
+};
+
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
        unsigned long nr_files;         /* read only */
@@ -317,6 +323,7 @@ struct inodes_stat_t {
 #define FIGETBSZ   _IO(0x00,2) /* get the block size used for bmap */
 #define FIFREEZE       _IOWR('X', 119, int)    /* Freeze */
 #define FITHAW         _IOWR('X', 120, int)    /* Thaw */
+#define FITRIM         _IOWR('X', 121, struct fstrim_range)    /* Trim */
 
 #define        FS_IOC_GETFLAGS                 _IOR('f', 1, long)
 #define        FS_IOC_SETFLAGS                 _IOW('f', 2, long)
@@ -1604,6 +1611,7 @@ struct super_operations {
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
        int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+       int (*trim_fs) (struct super_block *, struct fstrim_range *);
 };
 
 /*
index 0b52924..2ae86aa 100644 (file)
@@ -395,7 +395,7 @@ struct jbd2_inode {
        struct inode *i_vfs_inode;
 
        /* Flags of inode [j_list_lock] */
-       unsigned int i_flags;
+       unsigned long i_flags;
 };
 
 struct jbd2_revoke_table_s;
index 8a7d510..46f6ba5 100644 (file)
@@ -78,6 +78,11 @@ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
        return 1;
 }
 
+static inline int percpu_counter_initialized(struct percpu_counter *fbc)
+{
+       return fbc->counters ? 1 : 0;   /* 1 when ->counters is non-NULL */
+}
+
 #else
 
 struct percpu_counter {
@@ -143,6 +148,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
        return percpu_counter_read(fbc);
 }
 
+static inline int percpu_counter_initialized(struct percpu_counter *fbc)
+{
+       return 1;       /* #else branch of the CONFIG_SMP conditional: always reports initialized */
+}
+
 #endif /* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
index d5c7aaa..09eec35 100644 (file)
@@ -141,6 +141,8 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 
 int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc);
+void tag_pages_for_writeback(struct address_space *mapping,
+                            pgoff_t start, pgoff_t end);
 int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
index 6bcb006..289010d 100644 (file)
@@ -21,7 +21,8 @@ TRACE_EVENT(ext4_free_inode,
        TP_ARGS(inode),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        uid_t,  uid                     )
@@ -30,7 +31,8 @@ TRACE_EVENT(ext4_free_inode,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->uid    = inode->i_uid;
@@ -38,9 +40,10 @@ TRACE_EVENT(ext4_free_inode,
                __entry->blocks = inode->i_blocks;
        ),
 
-       TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode, __entry->uid, __entry->gid,
+       TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 __entry->uid, __entry->gid,
                  (unsigned long long) __entry->blocks)
 );
 
@@ -50,20 +53,22 @@ TRACE_EVENT(ext4_request_inode,
        TP_ARGS(dir, mode),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  dir                     )
                __field(        umode_t, mode                   )
        ),
 
        TP_fast_assign(
-               __entry->dev    = dir->i_sb->s_dev;
+               __entry->dev_major = MAJOR(dir->i_sb->s_dev);
+               __entry->dev_minor = MINOR(dir->i_sb->s_dev);
                __entry->dir    = dir->i_ino;
                __entry->mode   = mode;
        ),
 
-       TP_printk("dev %s dir %lu mode 0%o",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir,
-                 __entry->mode)
+       TP_printk("dev %d,%d dir %lu mode 0%o",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->dir, __entry->mode)
 );
 
 TRACE_EVENT(ext4_allocate_inode,
@@ -72,21 +77,24 @@ TRACE_EVENT(ext4_allocate_inode,
        TP_ARGS(inode, dir, mode),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        ino_t,  dir                     )
                __field(        umode_t, mode                   )
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->dir    = dir->i_ino;
                __entry->mode   = mode;
        ),
 
-       TP_printk("dev %s ino %lu dir %lu mode 0%o",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
 );
 
@@ -98,7 +106,8 @@ DECLARE_EVENT_CLASS(ext4__write_begin,
        TP_ARGS(inode, pos, len, flags),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        loff_t, pos                     )
                __field(        unsigned int, len               )
@@ -106,15 +115,17 @@ DECLARE_EVENT_CLASS(ext4__write_begin,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->pos    = pos;
                __entry->len    = len;
                __entry->flags  = flags;
        ),
 
-       TP_printk("dev %s ino %lu pos %llu len %u flags %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->flags)
 );
 
@@ -141,7 +152,8 @@ DECLARE_EVENT_CLASS(ext4__write_end,
        TP_ARGS(inode, pos, len, copied),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        loff_t, pos                     )
                __field(        unsigned int, len               )
@@ -149,16 +161,18 @@ DECLARE_EVENT_CLASS(ext4__write_end,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->pos    = pos;
                __entry->len    = len;
                __entry->copied = copied;
        ),
 
-       TP_printk("dev %s ino %lu pos %llu len %u copied %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->copied)
+       TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->pos,
+                 __entry->len, __entry->copied)
 );
 
 DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
@@ -199,21 +213,23 @@ TRACE_EVENT(ext4_writepage,
        TP_ARGS(inode, page),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        pgoff_t, index                  )
 
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->index  = page->index;
        ),
 
-       TP_printk("dev %s ino %lu page_index %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->index)
+       TP_printk("dev %d,%d ino %lu page_index %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->index)
 );
 
 TRACE_EVENT(ext4_da_writepages,
@@ -222,13 +238,13 @@ TRACE_EVENT(ext4_da_writepages,
        TP_ARGS(inode, wbc),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        long,   nr_to_write             )
                __field(        long,   pages_skipped           )
                __field(        loff_t, range_start             )
                __field(        loff_t, range_end               )
-               __field(        char,   nonblocking             )
                __field(        char,   for_kupdate             )
                __field(        char,   for_reclaim             )
                __field(        char,   range_cyclic            )
@@ -236,7 +252,8 @@ TRACE_EVENT(ext4_da_writepages,
        ),
 
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->nr_to_write    = wbc->nr_to_write;
                __entry->pages_skipped  = wbc->pages_skipped;
@@ -248,11 +265,11 @@ TRACE_EVENT(ext4_da_writepages,
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
 
-       TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld "
+       TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
                  "range_start %llu range_end %llu "
                  "for_kupdate %d for_reclaim %d "
                  "range_cyclic %d writeback_index %lu",
-                 jbd2_dev_to_name(__entry->dev),
+                 __entry->dev_major, __entry->dev_minor,
                  (unsigned long) __entry->ino, __entry->nr_to_write,
                  __entry->pages_skipped, __entry->range_start,
                  __entry->range_end,
@@ -267,7 +284,8 @@ TRACE_EVENT(ext4_da_write_pages,
        TP_ARGS(inode, mpd),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  b_blocknr               )
                __field(        __u32,  b_size                  )
@@ -278,7 +296,8 @@ TRACE_EVENT(ext4_da_write_pages,
        ),
 
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->b_blocknr      = mpd->b_blocknr;
                __entry->b_size         = mpd->b_size;
@@ -288,8 +307,9 @@ TRACE_EVENT(ext4_da_write_pages,
                __entry->pages_written  = mpd->pages_written;
        ),
 
-       TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->b_blocknr, __entry->b_size,
                  __entry->b_state, __entry->first_page,
                  __entry->io_done, __entry->pages_written)
@@ -302,7 +322,8 @@ TRACE_EVENT(ext4_da_writepages_result,
        TP_ARGS(inode, wbc, ret, pages_written),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        int,    ret                     )
                __field(        int,    pages_written           )
@@ -312,7 +333,8 @@ TRACE_EVENT(ext4_da_writepages_result,
        ),
 
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->ret            = ret;
                __entry->pages_written  = pages_written;
@@ -321,8 +343,8 @@ TRACE_EVENT(ext4_da_writepages_result,
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
 
-       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
-                 jbd2_dev_to_name(__entry->dev),
+       TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
+                 __entry->dev_major, __entry->dev_minor,
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
                  __entry->more_io,
@@ -336,20 +358,23 @@ TRACE_EVENT(ext4_discard_blocks,
        TP_ARGS(sb, blk, count),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        __u64,  blk                     )
                __field(        __u64,  count                   )
 
        ),
 
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->blk    = blk;
                __entry->count  = count;
        ),
 
-       TP_printk("dev %s blk %llu count %llu",
-                 jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count)
+       TP_printk("dev %d,%d blk %llu count %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->blk, __entry->count)
 );
 
 DECLARE_EVENT_CLASS(ext4__mb_new_pa,
@@ -359,7 +384,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        TP_ARGS(ac, pa),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  pa_pstart               )
                __field(        __u32,  pa_len                  )
@@ -368,16 +394,18 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ac->ac_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_sb->s_dev);
                __entry->ino            = ac->ac_inode->i_ino;
                __entry->pa_pstart      = pa->pa_pstart;
                __entry->pa_len         = pa->pa_len;
                __entry->pa_lstart      = pa->pa_lstart;
        ),
 
-       TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
+       TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->pa_pstart,
+                 __entry->pa_len, __entry->pa_lstart)
 );
 
 DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
@@ -398,14 +426,15 @@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
 
 TRACE_EVENT(ext4_mb_release_inode_pa,
        TP_PROTO(struct super_block *sb,
-                struct ext4_allocation_context *ac,
+                struct inode *inode,
                 struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),
 
-       TP_ARGS(sb, ac, pa, block, count),
+       TP_ARGS(sb, inode, pa, block, count),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  block                   )
                __field(        __u32,  count                   )
@@ -413,43 +442,42 @@ TRACE_EVENT(ext4_mb_release_inode_pa,
        ),
 
        TP_fast_assign(
-               __entry->dev            = sb->s_dev;
-               __entry->ino            = (ac && ac->ac_inode) ? 
-                                               ac->ac_inode->i_ino : 0;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
+               __entry->ino            = inode->i_ino;
                __entry->block          = block;
                __entry->count          = count;
        ),
 
-       TP_printk("dev %s ino %lu block %llu count %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->block, __entry->count)
+       TP_printk("dev %d,%d ino %lu block %llu count %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->block, __entry->count)
 );
 
 TRACE_EVENT(ext4_mb_release_group_pa,
        TP_PROTO(struct super_block *sb,
-                struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),
 
-       TP_ARGS(sb, ac, pa),
+       TP_ARGS(sb, pa),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        __u64,  pa_pstart               )
                __field(        __u32,  pa_len                  )
 
        ),
 
        TP_fast_assign(
-               __entry->dev            = sb->s_dev;
-               __entry->ino            = (ac && ac->ac_inode) ?
-                                               ac->ac_inode->i_ino : 0;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
                __entry->pa_pstart      = pa->pa_pstart;
                __entry->pa_len         = pa->pa_len;
        ),
 
-       TP_printk("dev %s pstart %llu len %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len)
+       TP_printk("dev %d,%d pstart %llu len %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->pa_pstart, __entry->pa_len)
 );
 
 TRACE_EVENT(ext4_discard_preallocations,
@@ -458,18 +486,21 @@ TRACE_EVENT(ext4_discard_preallocations,
        TP_ARGS(inode),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
 
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
        ),
 
-       TP_printk("dev %s ino %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
+       TP_printk("dev %d,%d ino %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino)
 );
 
 TRACE_EVENT(ext4_mb_discard_preallocations,
@@ -478,18 +509,20 @@ TRACE_EVENT(ext4_mb_discard_preallocations,
        TP_ARGS(sb, needed),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        int,    needed                  )
 
        ),
 
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->needed = needed;
        ),
 
-       TP_printk("dev %s needed %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->needed)
+       TP_printk("dev %d,%d needed %d",
+                 __entry->dev_major, __entry->dev_minor, __entry->needed)
 );
 
 TRACE_EVENT(ext4_request_blocks,
@@ -498,7 +531,8 @@ TRACE_EVENT(ext4_request_blocks,
        TP_ARGS(ar),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        unsigned int, flags             )
                __field(        unsigned int, len               )
@@ -511,7 +545,8 @@ TRACE_EVENT(ext4_request_blocks,
        ),
 
        TP_fast_assign(
-               __entry->dev    = ar->inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev);
                __entry->ino    = ar->inode->i_ino;
                __entry->flags  = ar->flags;
                __entry->len    = ar->len;
@@ -523,8 +558,9 @@ TRACE_EVENT(ext4_request_blocks,
                __entry->pright = ar->pright;
        ),
 
-       TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->flags, __entry->len,
                  (unsigned long long) __entry->logical,
                  (unsigned long long) __entry->goal,
@@ -540,7 +576,8 @@ TRACE_EVENT(ext4_allocate_blocks,
        TP_ARGS(ar, block),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u64,  block                   )
                __field(        unsigned int, flags             )
@@ -554,7 +591,8 @@ TRACE_EVENT(ext4_allocate_blocks,
        ),
 
        TP_fast_assign(
-               __entry->dev    = ar->inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev);
                __entry->ino    = ar->inode->i_ino;
                __entry->block  = block;
                __entry->flags  = ar->flags;
@@ -567,9 +605,10 @@ TRACE_EVENT(ext4_allocate_blocks,
                __entry->pright = ar->pright;
        ),
 
-       TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->flags, __entry->len, __entry->block,
+       TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->flags,
+                 __entry->len, __entry->block,
                  (unsigned long long) __entry->logical,
                  (unsigned long long) __entry->goal,
                  (unsigned long long) __entry->lleft,
@@ -585,7 +624,8 @@ TRACE_EVENT(ext4_free_blocks,
        TP_ARGS(inode, block, count, flags),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(      umode_t, mode                     )
                __field(        __u64,  block                   )
@@ -594,7 +634,8 @@ TRACE_EVENT(ext4_free_blocks,
        ),
 
        TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                __entry->ino            = inode->i_ino;
                __entry->mode           = inode->i_mode;
                __entry->block          = block;
@@ -602,8 +643,9 @@ TRACE_EVENT(ext4_free_blocks,
                __entry->flags          = flags;
        ),
 
-       TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->mode, __entry->block, __entry->count,
                  __entry->flags)
 );
@@ -614,7 +656,8 @@ TRACE_EVENT(ext4_sync_file,
        TP_ARGS(file, datasync),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        ino_t,  parent                  )
                __field(        int,    datasync                )
@@ -623,14 +666,16 @@ TRACE_EVENT(ext4_sync_file,
        TP_fast_assign(
                struct dentry *dentry = file->f_path.dentry;
 
-               __entry->dev            = dentry->d_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(dentry->d_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(dentry->d_inode->i_sb->s_dev);
                __entry->ino            = dentry->d_inode->i_ino;
                __entry->datasync       = datasync;
                __entry->parent         = dentry->d_parent->d_inode->i_ino;
        ),
 
-       TP_printk("dev %s ino %ld parent %ld datasync %d ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
 );
 
@@ -640,18 +685,20 @@ TRACE_EVENT(ext4_sync_fs,
        TP_ARGS(sb, wait),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        int,    wait                    )
 
        ),
 
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->wait   = wait;
        ),
 
-       TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev),
-                 __entry->wait)
+       TP_printk("dev %d,%d wait %d", __entry->dev_major,
+                 __entry->dev_minor, __entry->wait)
 );
 
 TRACE_EVENT(ext4_alloc_da_blocks,
@@ -660,21 +707,24 @@ TRACE_EVENT(ext4_alloc_da_blocks,
        TP_ARGS(inode),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field( unsigned int,  data_blocks     )
                __field( unsigned int,  meta_blocks     )
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
        ),
 
-       TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->data_blocks, __entry->meta_blocks)
 );
 
@@ -684,7 +734,8 @@ TRACE_EVENT(ext4_mballoc_alloc,
        TP_ARGS(ac),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u16,  found                   )
                __field(        __u16,  groups                  )
@@ -707,7 +758,8 @@ TRACE_EVENT(ext4_mballoc_alloc,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_inode->i_sb->s_dev);
                __entry->ino            = ac->ac_inode->i_ino;
                __entry->found          = ac->ac_found;
                __entry->flags          = ac->ac_flags;
@@ -729,10 +781,11 @@ TRACE_EVENT(ext4_mballoc_alloc,
                __entry->result_len     = ac->ac_f_ex.fe_len;
        ),
 
-       TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
+       TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
                  "tail %u broken %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->goal_group, __entry->goal_start,
@@ -750,7 +803,8 @@ TRACE_EVENT(ext4_mballoc_prealloc,
        TP_ARGS(ac),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        __u32,  orig_logical            )
                __field(          int,  orig_start              )
@@ -763,7 +817,8 @@ TRACE_EVENT(ext4_mballoc_prealloc,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_inode->i_sb->s_dev);
                __entry->ino            = ac->ac_inode->i_ino;
                __entry->orig_logical   = ac->ac_o_ex.fe_logical;
                __entry->orig_start     = ac->ac_o_ex.fe_start;
@@ -775,8 +830,9 @@ TRACE_EVENT(ext4_mballoc_prealloc,
                __entry->result_len     = ac->ac_b_ex.fe_len;
        ),
 
-       TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->result_group, __entry->result_start,
@@ -784,46 +840,59 @@ TRACE_EVENT(ext4_mballoc_prealloc,
 );
 
 DECLARE_EVENT_CLASS(ext4__mballoc,
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
 
-       TP_ARGS(ac),
+       TP_ARGS(sb, inode, group, start, len),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
-               __field(        __u32,  result_logical          )
                __field(          int,  result_start            )
                __field(        __u32,  result_group            )
                __field(          int,  result_len              )
        ),
 
        TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
-               __entry->ino            = ac->ac_inode->i_ino;
-               __entry->result_logical = ac->ac_b_ex.fe_logical;
-               __entry->result_start   = ac->ac_b_ex.fe_start;
-               __entry->result_group   = ac->ac_b_ex.fe_group;
-               __entry->result_len     = ac->ac_b_ex.fe_len;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
+               __entry->ino            = inode ? inode->i_ino : 0;
+               __entry->result_start   = start;
+               __entry->result_group   = group;
+               __entry->result_len     = len;
        ),
 
-       TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->result_group, __entry->result_start,
-                 __entry->result_len, __entry->result_logical)
+                 __entry->result_len)
 );
 
 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
 
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
 
-       TP_ARGS(ac)
+       TP_ARGS(sb, inode, group, start, len)
 );
 
 DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
 
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
 
-       TP_ARGS(ac)
+       TP_ARGS(sb, inode, group, start, len)
 );
 
 TRACE_EVENT(ext4_forget,
@@ -832,7 +901,8 @@ TRACE_EVENT(ext4_forget,
        TP_ARGS(inode, is_metadata, block),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        int,    is_metadata             )
@@ -840,16 +910,18 @@ TRACE_EVENT(ext4_forget,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->is_metadata = is_metadata;
                __entry->block  = block;
        ),
 
-       TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode, __entry->is_metadata, __entry->block)
+       TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 __entry->is_metadata, __entry->block)
 );
 
 TRACE_EVENT(ext4_da_update_reserve_space,
@@ -858,7 +930,8 @@ TRACE_EVENT(ext4_da_update_reserve_space,
        TP_ARGS(inode, used_blocks),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        __u64,  i_blocks                )
@@ -869,7 +942,8 @@ TRACE_EVENT(ext4_da_update_reserve_space,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
@@ -879,9 +953,10 @@ TRACE_EVENT(ext4_da_update_reserve_space,
                __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
        ),
 
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode,  (unsigned long long) __entry->i_blocks,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 (unsigned long long) __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
 );
@@ -892,7 +967,8 @@ TRACE_EVENT(ext4_da_reserve_space,
        TP_ARGS(inode, md_needed),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        __u64,  i_blocks                )
@@ -902,7 +978,8 @@ TRACE_EVENT(ext4_da_reserve_space,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
@@ -911,8 +988,9 @@ TRACE_EVENT(ext4_da_reserve_space,
                __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
        ),
 
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->mode, (unsigned long long) __entry->i_blocks,
                  __entry->md_needed, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks)
@@ -924,7 +1002,8 @@ TRACE_EVENT(ext4_da_release_space,
        TP_ARGS(inode, freed_blocks),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
                __field(        umode_t, mode                   )
                __field(        __u64,  i_blocks                )
@@ -935,7 +1014,8 @@ TRACE_EVENT(ext4_da_release_space,
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
                __entry->mode   = inode->i_mode;
                __entry->i_blocks = inode->i_blocks;
@@ -945,8 +1025,9 @@ TRACE_EVENT(ext4_da_release_space,
                __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
        ),
 
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                  __entry->mode, (unsigned long long) __entry->i_blocks,
                  __entry->freed_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
@@ -958,18 +1039,20 @@ DECLARE_EVENT_CLASS(ext4__bitmap_load,
        TP_ARGS(sb, group),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        __u32,  group                   )
 
        ),
 
        TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                __entry->group  = group;
        ),
 
-       TP_printk("dev %s group %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->group)
+       TP_printk("dev %d,%d group %u",
+                 __entry->dev_major, __entry->dev_minor, __entry->group)
 );
 
 DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
index bf16545..7447ea9 100644 (file)
@@ -17,17 +17,19 @@ TRACE_EVENT(jbd2_checkpoint,
        TP_ARGS(journal, result),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,    dev_major               )
+               __field(        int,    dev_minor               )
                __field(        int,    result                  )
        ),
 
        TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
+               __entry->dev_major      = MAJOR(journal->j_fs_dev->bd_dev);
+               __entry->dev_minor      = MINOR(journal->j_fs_dev->bd_dev);
                __entry->result         = result;
        ),
 
-       TP_printk("dev %s result %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->result)
+       TP_printk("dev %d,%d result %d",
+                 __entry->dev_major, __entry->dev_minor, __entry->result)
 );
 
 DECLARE_EVENT_CLASS(jbd2_commit,
@@ -37,20 +39,22 @@ DECLARE_EVENT_CLASS(jbd2_commit,
        TP_ARGS(journal, commit_transaction),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        char,   sync_commit               )
                __field(        int,    transaction               )
        ),
 
        TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
+               __entry->dev_major      = MAJOR(journal->j_fs_dev->bd_dev);
+               __entry->dev_minor      = MINOR(journal->j_fs_dev->bd_dev);
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction    = commit_transaction->t_tid;
        ),
 
-       TP_printk("dev %s transaction %d sync %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->transaction,
-                 __entry->sync_commit)
+       TP_printk("dev %d,%d transaction %d sync %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->transaction, __entry->sync_commit)
 );
 
 DEFINE_EVENT(jbd2_commit, jbd2_start_commit,
@@ -87,22 +91,24 @@ TRACE_EVENT(jbd2_end_commit,
        TP_ARGS(journal, commit_transaction),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        char,   sync_commit               )
                __field(        int,    transaction               )
                __field(        int,    head                      )
        ),
 
        TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
+               __entry->dev_major      = MAJOR(journal->j_fs_dev->bd_dev);
+               __entry->dev_minor      = MINOR(journal->j_fs_dev->bd_dev);
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction    = commit_transaction->t_tid;
                __entry->head           = journal->j_tail_sequence;
        ),
 
-       TP_printk("dev %s transaction %d sync %d head %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->transaction,
-                 __entry->sync_commit, __entry->head)
+       TP_printk("dev %d,%d transaction %d sync %d head %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->transaction, __entry->sync_commit, __entry->head)
 );
 
 TRACE_EVENT(jbd2_submit_inode_data,
@@ -111,17 +117,20 @@ TRACE_EVENT(jbd2_submit_inode_data,
        TP_ARGS(inode),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        ino_t,  ino                     )
        ),
 
        TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                __entry->ino    = inode->i_ino;
        ),
 
-       TP_printk("dev %s ino %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
+       TP_printk("dev %d,%d ino %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino)
 );
 
 TRACE_EVENT(jbd2_run_stats,
@@ -131,7 +140,8 @@ TRACE_EVENT(jbd2_run_stats,
        TP_ARGS(dev, tid, stats),
 
        TP_STRUCT__entry(
-               __field(                dev_t,  dev             )
+               __field(                  int,  dev_major       )
+               __field(                  int,  dev_minor       )
                __field(        unsigned long,  tid             )
                __field(        unsigned long,  wait            )
                __field(        unsigned long,  running         )
@@ -144,7 +154,8 @@ TRACE_EVENT(jbd2_run_stats,
        ),
 
        TP_fast_assign(
-               __entry->dev            = dev;
+               __entry->dev_major      = MAJOR(dev);
+               __entry->dev_minor      = MINOR(dev);
                __entry->tid            = tid;
                __entry->wait           = stats->rs_wait;
                __entry->running        = stats->rs_running;
@@ -156,9 +167,9 @@ TRACE_EVENT(jbd2_run_stats,
                __entry->blocks_logged  = stats->rs_blocks_logged;
        ),
 
-       TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u "
+       TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u "
                  "logging %u handle_count %u blocks %u blocks_logged %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->tid,
+                 __entry->dev_major, __entry->dev_minor, __entry->tid,
                  jiffies_to_msecs(__entry->wait),
                  jiffies_to_msecs(__entry->running),
                  jiffies_to_msecs(__entry->locked),
@@ -175,7 +186,8 @@ TRACE_EVENT(jbd2_checkpoint_stats,
        TP_ARGS(dev, tid, stats),
 
        TP_STRUCT__entry(
-               __field(                dev_t,  dev             )
+               __field(                  int,  dev_major       )
+               __field(                  int,  dev_minor       )
                __field(        unsigned long,  tid             )
                __field(        unsigned long,  chp_time        )
                __field(                __u32,  forced_to_close )
@@ -184,7 +196,8 @@ TRACE_EVENT(jbd2_checkpoint_stats,
        ),
 
        TP_fast_assign(
-               __entry->dev            = dev;
+               __entry->dev_major      = MAJOR(dev);
+               __entry->dev_minor      = MINOR(dev);
                __entry->tid            = tid;
                __entry->chp_time       = stats->cs_chp_time;
                __entry->forced_to_close= stats->cs_forced_to_close;
@@ -192,9 +205,9 @@ TRACE_EVENT(jbd2_checkpoint_stats,
                __entry->dropped        = stats->cs_dropped;
        ),
 
-       TP_printk("dev %s tid %lu chp_time %u forced_to_close %u "
+       TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u "
                  "written %u dropped %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->tid,
+                 __entry->dev_major, __entry->dev_minor, __entry->tid,
                  jiffies_to_msecs(__entry->chp_time),
                  __entry->forced_to_close, __entry->written, __entry->dropped)
 );
@@ -207,7 +220,8 @@ TRACE_EVENT(jbd2_cleanup_journal_tail,
        TP_ARGS(journal, first_tid, block_nr, freed),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                __field(        tid_t,  tail_sequence           )
                __field(        tid_t,  first_tid               )
                __field(unsigned long,  block_nr                )
@@ -215,16 +229,18 @@ TRACE_EVENT(jbd2_cleanup_journal_tail,
        ),
 
        TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
+               __entry->dev_major      = MAJOR(journal->j_fs_dev->bd_dev);
+               __entry->dev_minor      = MINOR(journal->j_fs_dev->bd_dev);
                __entry->tail_sequence  = journal->j_tail_sequence;
                __entry->first_tid      = first_tid;
                __entry->block_nr       = block_nr;
                __entry->freed          = freed;
        ),
 
-       TP_printk("dev %s from %u to %u offset %lu freed %lu",
-                 jbd2_dev_to_name(__entry->dev), __entry->tail_sequence,
-                 __entry->first_tid, __entry->block_nr, __entry->freed)
+       TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->tail_sequence, __entry->first_tid,
+                 __entry->block_nr, __entry->freed)
 );
 
 #endif /* _TRACE_JBD2_H */