ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 10cff47..464e95d 100644
@@ -37,7 +37,6 @@
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/falloc.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         * the extent that was written properly split out and conversion to
         * initialized is trivial.
         */
-       if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
+       if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
                return 0;
 
        ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         */
        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
                return 0;
+       if (ext4_ext_is_uninitialized(ex1) &&
+           (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+            atomic_read(&EXT4_I(inode)->i_unwritten) ||
+            (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
+               return 0;
 #ifdef AGGRESSIVE_TEST
        if (ext1_ee_len >= 4)
                return 0;
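
For context on the EXT_UNINIT_MAX_LEN cap added above: the uninitialized (unwritten) state lives in bit 15 of the on-disk ee_len field, so unwritten extents top out one block short of initialized ones. A minimal userspace sketch of that encoding (my own model, not kernel code):

#include <assert.h>
#include <stdint.h>

/* ee_len keeps the length in the low 15 bits; bit 15 flags "uninitialized".
 * An initialized extent can span 32768 blocks, an uninitialized one only
 * 32767 -- hence the EXT_UNINIT_MAX_LEN check added above. */
#define INIT_MAX_LEN   (1u << 15)
#define UNINIT_MAX_LEN (INIT_MAX_LEN - 1)

static uint16_t mark_uninit(uint16_t ee_len) { return ee_len | 0x8000; }
static int is_uninit(uint16_t ee_len) { return ee_len > INIT_MAX_LEN; }
static uint16_t actual_len(uint16_t ee_len)
{
        return ee_len <= INIT_MAX_LEN ? ee_len : ee_len - INIT_MAX_LEN;
}

int main(void)
{
        uint16_t len = mark_uninit(100);

        assert(is_uninit(len) && actual_len(len) == 100);
        /* Storing a plain sum, as a merge does, drops the flag bit ... */
        len = actual_len(len) + 50;
        assert(!is_uninit(len) && actual_len(len) == 150);
        /* ... which is why the merge paths below save "uninit" and re-mark. */
        return 0;
}
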
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
-       int merge_done = 0;
+       int merge_done = 0, uninit;
 
        depth = ext_depth(inode);
        BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
                        break;
                /* merge with next extent! */
+               uninit = ext4_ext_is_uninitialized(ex);
                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                + ext4_ext_get_actual_len(ex + 1));
+               if (uninit)
+                       ext4_ext_mark_uninitialized(ex);
 
                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        struct ext4_ext_path *npath = NULL;
        int depth, len, err;
        ext4_lblk_t next;
-       int mb_flags = 0;
+       int mb_flags = 0, uninit;
 
        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
                                                  path + depth);
                        if (err)
                                return err;
-
+                       uninit = ext4_ext_is_uninitialized(ex);
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
+                       if (uninit)
+                               ext4_ext_mark_uninitialized(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
@@ -1971,10 +1980,13 @@ prepend:
                        if (err)
                                return err;
 
+                       uninit = ext4_ext_is_uninitialized(ex);
                        ex->ee_block = newext->ee_block;
                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
+                       if (uninit)
+                               ext4_ext_mark_uninitialized(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        ex_ee_block = le32_to_cpu(ex->ee_block);
        ex_ee_len = ext4_ext_get_actual_len(ex);
 
+       /*
+        * If we're starting with an extent other than the last one in the
+        * node, we need to see if it shares a cluster with the extent to
+        * the right (towards the end of the file). If its leftmost cluster
+        * is this extent's rightmost cluster and it is not cluster aligned,
+        * we'll mark it as a partial cluster that is not to be deallocated.
+        */
+
+       if (ex != EXT_LAST_EXTENT(eh)) {
+               ext4_fsblk_t current_pblk, right_pblk;
+               long long current_cluster, right_cluster;
+
+               current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+               current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+               right_pblk = ext4_ext_pblock(ex + 1);
+               right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+               if (current_cluster == right_cluster &&
+                       EXT4_PBLK_COFF(sbi, right_pblk))
+                       *partial_cluster = -right_cluster;
+       }
+
        trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
 
        while (ex >= EXT_FIRST_EXTENT(eh) &&
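
The shared-cluster test in this hunk is plain shift-and-mask arithmetic on physical block numbers. A rough userspace model, assuming EXT4_B2C() right-shifts by s_cluster_bits and EXT4_PBLK_COFF() masks by the cluster ratio (matching their ext4.h definitions as I understand them):

#include <assert.h>
#include <stdint.h>

#define CLUSTER_BITS 4  /* assume a bigalloc ratio of 16 blocks per cluster */

static uint64_t b2c(uint64_t pblk) { return pblk >> CLUSTER_BITS; }
static uint64_t pblk_coff(uint64_t pblk)
{
        return pblk & ((1ull << CLUSTER_BITS) - 1);
}

int main(void)
{
        /* Left extent ends at block 46, right extent starts at block 47:
         * both map to cluster 2 (blocks 32..47), and 47 is not cluster
         * aligned, so the leaf-removal code records -2 as a partial
         * cluster that must not be freed. */
        assert(b2c(46) == 2 && b2c(47) == 2 && pblk_coff(47) != 0);

        /* If the right extent instead starts at block 48 (cluster 3, and
         * cluster aligned), no cluster is shared and nothing is marked. */
        assert(b2c(48) == 3 && pblk_coff(48) == 0);
        return 0;
}
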
@@ -3569,6 +3602,8 @@ out:
  *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
  *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
 * One or more index blocks may be needed if the extent tree grows after
 * the uninitialized extent split. To prevent ENOSPC from occurring at I/O
 * completion, we need to split the uninitialized extent before DIO submit
@@ -3579,7 +3614,7 @@ out:
  *
 * Returns the size of the uninitialized extent to be written on success.
  */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_map_blocks *map,
                                        struct ext4_ext_path *path,
@@ -3591,9 +3626,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        unsigned int ee_len;
        int split_flag = 0, depth;
 
-       ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-               "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)map->m_lblk, map->m_len);
+       ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+                 __func__, inode->i_ino,
+                 (unsigned long long)map->m_lblk, map->m_len);
 
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3643,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
 
-       split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-       split_flag |= EXT4_EXT_MARK_UNINIT2;
-       if (flags & EXT4_GET_BLOCKS_CONVERT)
-               split_flag |= EXT4_EXT_DATA_VALID2;
+       /* Convert to unwritten */
+       if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+               split_flag |= EXT4_EXT_DATA_VALID1;
+       /* Convert to initialized */
+       } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+               split_flag |= ee_block + ee_len <= eof_block ?
+                             EXT4_EXT_MAY_ZEROOUT : 0;
+               split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
+       }
        flags |= EXT4_GET_BLOCKS_PRE_IO;
        return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
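
For reference, the three split cases listed in the comment above, with made-up block numbers:

/*
 * Illustrative numbers for the a>/b>/c> cases: take an unwritten extent
 * covering logical blocks [100, 200).
 *
 *   a> a write covering [100, 200) converts the whole extent in place;
 *   b> a write covering [100, 150) splits it in two:
 *         [100, 150) + [150, 200);
 *   c> a write covering [120, 150) splits it in three:
 *         [100, 120) + [120, 150) + [150, 200).
 *
 * The same splits apply when converting initialized ranges to unwritten
 * for FALLOC_FL_ZERO_RANGE, per the note added above.
 */
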
 
+static int ext4_convert_initialized_extents(handle_t *handle,
+                                           struct inode *inode,
+                                           struct ext4_map_blocks *map,
+                                           struct ext4_ext_path *path)
+{
+       struct ext4_extent *ex;
+       ext4_lblk_t ee_block;
+       unsigned int ee_len;
+       int depth;
+       int err = 0;
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
+
+       ext_debug("%s: inode %lu, logical"
+               "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+                 (unsigned long long)ee_block, ee_len);
+
+       if (ee_block != map->m_lblk || ee_len > map->m_len) {
+               err = ext4_split_convert_extents(handle, inode, map, path,
+                               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+               if (err < 0)
+                       goto out;
+               ext4_ext_drop_refs(path);
+               path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+               if (IS_ERR(path)) {
+                       err = PTR_ERR(path);
+                       goto out;
+               }
+               depth = ext_depth(inode);
+               ex = path[depth].p_ext;
+       }
+
+       err = ext4_ext_get_access(handle, inode, path + depth);
+       if (err)
+               goto out;
+       /* first mark the extent as uninitialized */
+       ext4_ext_mark_uninitialized(ex);
+
+       /* note: ext4_ext_correct_indexes() isn't needed here because
+        * borders are not changed
+        */
+       ext4_ext_try_to_merge(handle, inode, path, ex);
+
+       /* Mark modified extent as dirty */
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+       ext4_ext_show_leaf(inode, path);
+       return err;
+}
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_map_blocks *map,
@@ -3649,8 +3743,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
 #endif
-               err = ext4_split_unwritten_extents(handle, inode, map, path,
-                                                  EXT4_GET_BLOCKS_CONVERT);
+               err = ext4_split_convert_extents(handle, inode, map, path,
+                                                EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
                        goto out;
                ext4_ext_drop_refs(path);
@@ -3850,6 +3944,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
        return allocated_clusters;
 }
 
+static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+                       struct ext4_map_blocks *map,
+                       struct ext4_ext_path *path, int flags,
+                       unsigned int allocated, ext4_fsblk_t newblock)
+{
+       int ret = 0;
+       int err = 0;
+
+       /*
+        * Make sure that the extent is no bigger than we support with
+        * an uninitialized extent
+        */
+       if (map->m_len > EXT_UNINIT_MAX_LEN)
+               map->m_len = EXT_UNINIT_MAX_LEN / 2;
+
+       ret = ext4_convert_initialized_extents(handle, inode, map,
+                                               path);
+       if (ret >= 0) {
+               ext4_update_inode_fsync_trans(handle, inode, 1);
+               err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                        path, map->m_len);
+       } else
+               err = ret;
+       map->m_flags |= EXT4_MAP_UNWRITTEN;
+       if (allocated > map->m_len)
+               allocated = map->m_len;
+       map->m_len = allocated;
+
+       return err ? err : allocated;
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3877,8 +4003,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
        /* get_block() before submitting the IO, split the extent */
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-               ret = ext4_split_unwritten_extents(handle, inode, map,
-                                                  path, flags);
+               ret = ext4_split_convert_extents(handle, inode, map,
+                                        path, flags | EXT4_GET_BLOCKS_CONVERT);
                if (ret <= 0)
                        goto out;
                /*
@@ -3906,6 +4032,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                } else
                        err = ret;
                map->m_flags |= EXT4_MAP_MAPPED;
+               map->m_pblk = newblock;
                if (allocated > map->m_len)
                        allocated = map->m_len;
                map->m_len = allocated;
@@ -3992,10 +4119,6 @@ out1:
        map->m_pblk = newblock;
        map->m_len = allocated;
 out2:
-       if (path) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-       }
        return err ? err : allocated;
 }
 
@@ -4127,7 +4250,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        struct ext4_extent newex, *ex, *ex2;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_fsblk_t newblock = 0;
-       int free_on_err = 0, err = 0, depth;
+       int free_on_err = 0, err = 0, depth, ret;
        unsigned int allocated = 0, offset = 0;
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
@@ -4169,6 +4292,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
 
                /*
                 * Uninitialized extents are treated as holes, except that
                 * we split out initialized portions during a write.
@@ -4185,13 +4309,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
                                  ee_block, ee_len, newblock);
 
-                       if (!ext4_ext_is_uninitialized(ex))
+                       /*
+                        * If the extent is initialized, check whether the
+                        * caller wants to convert it to unwritten.
+                        */
+                       if ((!ext4_ext_is_uninitialized(ex)) &&
+                           (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+                               allocated = ext4_ext_convert_initialized_extent(
+                                               handle, inode, map, path, flags,
+                                               allocated, newblock);
+                               goto out2;
+                       } else if (!ext4_ext_is_uninitialized(ex))
                                goto out;
 
-                       allocated = ext4_ext_handle_uninitialized_extents(
+                       ret = ext4_ext_handle_uninitialized_extents(
                                handle, inode, map, path, flags,
                                allocated, newblock);
-                       goto out3;
+                       if (ret < 0)
+                               err = ret;
+                       else
+                               allocated = ret;
+                       goto out2;
                }
        }
 
@@ -4472,7 +4610,6 @@ out2:
                kfree(path);
        }
 
-out3:
        trace_ext4_ext_map_blocks_exit(inode, flags, map,
                                       err ? err : allocated);
        ext4_es_lru_add(inode);
@@ -4513,34 +4650,200 @@ retry:
        ext4_std_error(inode->i_sb, err);
 }
 
-static void ext4_falloc_update_inode(struct inode *inode,
-                               int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+                                 ext4_lblk_t len, int flags, int mode)
 {
-       struct timespec now;
+       struct inode *inode = file_inode(file);
+       handle_t *handle;
+       int ret = 0;
+       int ret2 = 0;
+       int retries = 0;
+       struct ext4_map_blocks map;
+       unsigned int credits;
 
-       if (update_ctime) {
-               now = current_fs_time(inode->i_sb);
-               if (!timespec_equal(&inode->i_ctime, &now))
-                       inode->i_ctime = now;
+       map.m_lblk = offset;
+       /*
+        * Don't normalize the request if it can fit in one extent so
+        * that it doesn't get unnecessarily split into multiple
+        * extents.
+        */
+       if (len <= EXT_UNINIT_MAX_LEN)
+               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+       /*
+        * credits to insert 1 extent into extent tree
+        */
+       credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+       while (ret >= 0 && ret < len) {
+               map.m_lblk = map.m_lblk + ret;
+               map.m_len = len = len - ret;
+               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+                                           credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       break;
+               }
+               ret = ext4_map_blocks(handle, inode, &map, flags);
+               if (ret <= 0) {
+                       ext4_debug("inode #%lu: block %u: len %u: "
+                                  "ext4_ext_map_blocks returned %d",
+                                  inode->i_ino, map.m_lblk,
+                                  map.m_len, ret);
+                       ext4_mark_inode_dirty(handle, inode);
+                       ret2 = ext4_journal_stop(handle);
+                       break;
+               }
+               ret2 = ext4_journal_stop(handle);
+               if (ret2)
+                       break;
+       }
+       if (ret == -ENOSPC &&
+                       ext4_should_retry_alloc(inode->i_sb, &retries)) {
+               ret = 0;
+               goto retry;
        }
+
+       return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+                           loff_t len, int mode)
+{
+       struct inode *inode = file_inode(file);
+       handle_t *handle = NULL;
+       unsigned int max_blocks;
+       loff_t new_size = 0;
+       int ret = 0;
+       int flags;
+       int partial;
+       loff_t start, end;
+       ext4_lblk_t lblk;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned int blkbits = inode->i_blkbits;
+
+       trace_ext4_zero_range(inode, offset, len, mode);
+
+       /*
+        * Write out all dirty pages to avoid race conditions,
+        * then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               ret = filemap_write_and_wait_range(mapping, offset,
+                                                  offset + len - 1);
+               if (ret)
+                       return ret;
+       }
+
        /*
-        * Update only when preallocation was requested beyond
-        * the file size.
+        * Round up offset. This is not fallocate, we need to zero out
+        * blocks, so convert interior block aligned part of the range to
+        * unwritten and possibly manually zero out unaligned parts of the
+        * range.
         */
-       if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+       start = round_up(offset, 1 << blkbits);
+       end = round_down((offset + len), 1 << blkbits);
+
+       if (start < offset || end > offset + len)
+               return -EINVAL;
+       partial = (offset + len) & ((1 << blkbits) - 1);
+
+       lblk = start >> blkbits;
+       max_blocks = (end >> blkbits);
+       if (max_blocks < lblk)
+               max_blocks = 0;
+       else
+               max_blocks -= lblk;
+
+       flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+       mutex_lock(&inode->i_mutex);
+
+       /*
+        * Indirect files do not support unwritten extents
+        */
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out_mutex;
+               /*
+                * If we have a partial block after EOF we have to allocate
+                * the entire block.
+                */
+               if (partial)
+                       max_blocks += 1;
+       }
+
+       if (max_blocks > 0) {
+               /* Now release the pages and zero block aligned part of pages */
+               truncate_pagecache_range(inode, start, end - 1);
+
+               /* Wait for all existing dio workers; newcomers block on i_mutex */
+               ext4_inode_block_unlocked_dio(inode);
+               inode_dio_wait(inode);
+
+               /*
+                * Remove entire range from the extent status tree.
+                */
+               ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+               if (ret)
+                       goto out_dio;
+
+               ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+                                            mode);
+               if (ret)
+                       goto out_dio;
+       }
+
+       handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               ext4_std_error(inode->i_sb, ret);
+               goto out_dio;
+       }
+
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+       if (!ret && new_size) {
                if (new_size > i_size_read(inode))
                        i_size_write(inode, new_size);
                if (new_size > EXT4_I(inode)->i_disksize)
                        ext4_update_i_disksize(inode, new_size);
-       } else {
+       } else if (!ret && !new_size) {
                /*
-                * Mark that we allocate beyond EOF so the subsequent truncate
-                * can proceed even if the new size is the same as i_size.
-                */
-               if (new_size > i_size_read(inode))
+                * Mark that we allocate beyond EOF so the subsequent truncate
+                * can proceed even if the new size is the same as i_size.
+                */
+               if ((offset + len) > i_size_read(inode))
                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
 
+       ext4_mark_inode_dirty(handle, inode);
+
+       /* Zero out partial block at the edges of the range */
+       ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+       if (file->f_flags & O_SYNC)
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
 }
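
A small userspace check of ext4_zero_range()'s rounding arithmetic, assuming a 4096-byte block size (values are illustrative):

#include <assert.h>
#include <stdint.h>

#define BLK 4096u       /* assumed block size, so blkbits == 12 */

static uint64_t round_up_to(uint64_t x, uint64_t a)
{
        return (x + a - 1) & ~(a - 1);
}
static uint64_t round_down_to(uint64_t x, uint64_t a)
{
        return x & ~(a - 1);
}

int main(void)
{
        uint64_t offset = 3000, len = 10000;
        uint64_t start = round_up_to(offset, BLK);        /* 4096  */
        uint64_t end = round_down_to(offset + len, BLK);  /* 12288 */

        assert(start == 4096 && end == 12288);
        assert((start >> 12) == 1);               /* lblk */
        assert((end >> 12) - (start >> 12) == 2); /* max_blocks: blocks 1..2 */
        /* partial tail: 13000 & 4095 == 712 bytes zeroed by hand; likewise
         * the 1096 head bytes (3000..4095) via ext4_zero_partial_blocks(). */
        assert(((offset + len) & (BLK - 1)) == 712);
        return 0;
}
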
 
 /*
@@ -4554,22 +4857,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
        handle_t *handle;
-       loff_t new_size;
+       loff_t new_size = 0;
        unsigned int max_blocks;
        int ret = 0;
-       int ret2 = 0;
-       int retries = 0;
        int flags;
-       struct ext4_map_blocks map;
-       unsigned int credits, blkbits = inode->i_blkbits;
+       ext4_lblk_t lblk;
+       struct timespec tv;
+       unsigned int blkbits = inode->i_blkbits;
 
        /* Return error if mode is not supported */
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
 
        if (mode & FALLOC_FL_PUNCH_HOLE)
                return ext4_punch_hole(inode, offset, len);
 
+       if (mode & FALLOC_FL_COLLAPSE_RANGE)
+               return ext4_collapse_range(inode, offset, len);
+
        ret = ext4_convert_inline_data(inode);
        if (ret)
                return ret;
@@ -4581,83 +4887,66 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EOPNOTSUPP;
 
+       if (mode & FALLOC_FL_ZERO_RANGE)
+               return ext4_zero_range(file, offset, len, mode);
+
        trace_ext4_fallocate_enter(inode, offset, len, mode);
-       map.m_lblk = offset >> blkbits;
+       lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because
         * if blocksize = 4096, offset = 3072 and len = 2048
         */
        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-               - map.m_lblk;
-       /*
-        * credits to insert 1 extent into extent tree
-        */
-       credits = ext4_chunk_trans_blocks(inode, max_blocks);
-       mutex_lock(&inode->i_mutex);
-       ret = inode_newsize_ok(inode, (len + offset));
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-               return ret;
-       }
+               - lblk;
+
        flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
        if (mode & FALLOC_FL_KEEP_SIZE)
                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
-       /*
-        * Don't normalize the request if it can fit in one extent so
-        * that it doesn't get unnecessarily split into multiple
-        * extents.
-        */
-       if (len <= EXT_UNINIT_MAX_LEN << blkbits)
-               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-retry:
-       while (ret >= 0 && ret < max_blocks) {
-               map.m_lblk = map.m_lblk + ret;
-               map.m_len = max_blocks = max_blocks - ret;
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       break;
-               }
-               ret = ext4_map_blocks(handle, inode, &map, flags);
-               if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
-                       ext4_warning(inode->i_sb,
-                                    "inode #%lu: block %u: len %u: "
-                                    "ext4_ext_map_blocks returned %d",
-                                    inode->i_ino, map.m_lblk,
-                                    map.m_len, ret);
-#endif
-                       ext4_mark_inode_dirty(handle, inode);
-                       ret2 = ext4_journal_stop(handle);
-                       break;
-               }
-               if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
-                                               blkbits) >> blkbits))
-                       new_size = offset + len;
-               else
-                       new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+       mutex_lock(&inode->i_mutex);
 
-               ext4_falloc_update_inode(inode, mode, new_size,
-                                        (map.m_flags & EXT4_MAP_NEW));
-               ext4_mark_inode_dirty(handle, inode);
-               if ((file->f_flags & O_SYNC) && ret >= max_blocks)
-                       ext4_handle_sync(handle);
-               ret2 = ext4_journal_stop(handle);
-               if (ret2)
-                       break;
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out;
        }
-       if (ret == -ENOSPC &&
-                       ext4_should_retry_alloc(inode->i_sb, &retries)) {
-               ret = 0;
-               goto retry;
+
+       ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+       if (ret)
+               goto out;
+
+       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+
+       tv = inode->i_ctime = ext4_current_time(inode);
+
+       if (!ret && new_size) {
+               if (new_size > i_size_read(inode)) {
+                       i_size_write(inode, new_size);
+                       inode->i_mtime = tv;
+               }
+               if (new_size > EXT4_I(inode)->i_disksize)
+                       ext4_update_i_disksize(inode, new_size);
+       } else if (!ret && !new_size) {
+               /*
+                * Mark that we allocate beyond EOF so the subsequent truncate
+                * can proceed even if the new size is the same as i_size.
+                */
+               if ((offset + len) > i_size_read(inode))
+                       ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
+       ext4_mark_inode_dirty(handle, inode);
+       if (file->f_flags & O_SYNC)
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+out:
        mutex_unlock(&inode->i_mutex);
-       trace_ext4_fallocate_exit(inode, offset, max_blocks,
-                               ret > 0 ? ret2 : ret);
-       return ret > 0 ? ret2 : ret;
+       trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+       return ret;
 }
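
From userspace, the new flag is exercised through fallocate(2). A minimal demonstration (a sketch: error handling is trimmed and the file name is made up):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>       /* FALLOC_FL_ZERO_RANGE */

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0 || ftruncate(fd, 1 << 20) < 0)
                return 1;

        /* Zero 64 KiB at offset 4096; KEEP_SIZE leaves i_size untouched.
         * Interior blocks become unwritten extents, edge bytes are zeroed,
         * and no data blocks are written. */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                      4096, 64 * 1024) < 0)
                perror("fallocate(FALLOC_FL_ZERO_RANGE)");

        close(fd);
        return 0;
}
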
 
 /*
@@ -4868,3 +5157,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        ext4_es_lru_add(inode);
        return error;
 }
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update the path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+               struct ext4_ext_path *path)
+{
+       int credits, err;
+
+       if (!ext4_handle_valid(handle))
+               return 0;
+
+       /*
+        * Check if we need to extend journal credits.
+        * 3 for leaf, sb, and inode plus 2 (bmap and group
+        * descriptor) for each block group; assume two block
+        * groups
+        */
+       if (handle->h_buffer_credits < 7) {
+               credits = ext4_writepage_trans_blocks(inode);
+               err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+               /* EAGAIN is success */
+               if (err && err != -EAGAIN)
+                       return err;
+       }
+
+       err = ext4_ext_get_access(handle, inode, path);
+       return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+                           struct inode *inode, handle_t *handle,
+                           ext4_lblk_t *start)
+{
+       int depth, err = 0;
+       struct ext4_extent *ex_start, *ex_last;
+       bool update = false;
+
+       depth = path->p_depth;
+
+       while (depth >= 0) {
+               if (depth == path->p_depth) {
+                       ex_start = path[depth].p_ext;
+                       if (!ex_start)
+                               return -EIO;
+
+                       ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+                       if (!ex_last)
+                               return -EIO;
+
+                       err = ext4_access_path(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+                               update = 1;
+
+                       *start = le32_to_cpu(ex_last->ee_block) +
+                               ext4_ext_get_actual_len(ex_last);
+
+                       while (ex_start <= ex_last) {
+                               le32_add_cpu(&ex_start->ee_block, -shift);
+                               if (ex_start >
+                                       EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+                                       if (ext4_ext_try_to_merge_right(inode,
+                                               path, ex_start - 1))
+                                               ex_last--;
+                               }
+                               ex_start++;
+                       }
+                       err = ext4_ext_dirty(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       if (--depth < 0 || !update)
+                               break;
+               }
+
+               /* Update index too */
+               err = ext4_access_path(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               /* we are done if current index is not a starting index */
+               if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+                       break;
+
+               depth--;
+       }
+
+out:
+       return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lie in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned; a negative error code otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+                      ext4_lblk_t start, ext4_lblk_t shift)
+{
+       struct ext4_ext_path *path;
+       int ret = 0, depth;
+       struct ext4_extent *extent;
+       ext4_lblk_t stop_block, current_block;
+       ext4_lblk_t ex_start, ex_end;
+
+       /* Let path point to the last extent */
+       path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+
+       depth = path->p_depth;
+       extent = path[depth].p_ext;
+       if (!extent) {
+               ext4_ext_drop_refs(path);
+               kfree(path);
+               return ret;
+       }
+
+       stop_block = le32_to_cpu(extent->ee_block) +
+                       ext4_ext_get_actual_len(extent);
+       ext4_ext_drop_refs(path);
+       kfree(path);
+
+       /* Nothing to shift if the hole is at the end of the file */
+       if (start >= stop_block)
+               return ret;
+
+       /*
+        * Don't start shifting extents until we make sure the hole is big
+        * enough to accommodate the shift.
+        */
+       path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+       depth = path->p_depth;
+       extent = path[depth].p_ext;
+       ex_start = le32_to_cpu(extent->ee_block);
+       ex_end = ex_start + ext4_ext_get_actual_len(extent);
+       ext4_ext_drop_refs(path);
+       kfree(path);
+
+       if ((start == ex_start && shift > ex_start) ||
+           (shift > start - ex_end))
+               return -EINVAL;
+
+       /* It's safe to start updating extents */
+       while (start < stop_block) {
+               path = ext4_ext_find_extent(inode, start, NULL, 0);
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
+               depth = path->p_depth;
+               extent = path[depth].p_ext;
+               current_block = le32_to_cpu(extent->ee_block);
+               if (start > current_block) {
+                       /* Hole, move to the next extent */
+                       ret = mext_next_extent(inode, path, &extent);
+                       if (ret != 0) {
+                               ext4_ext_drop_refs(path);
+                               kfree(path);
+                               if (ret == 1)
+                                       ret = 0;
+                               break;
+                       }
+               }
+               ret = ext4_ext_shift_path_extents(path, shift, inode,
+                               handle, &start);
+               ext4_ext_drop_refs(path);
+               kfree(path);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
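
What the shift performed by ext4_ext_shift_extents() amounts to, in a small userspace model with made-up block numbers:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        /* Collapse logical blocks [4, 8): everything at or after block 8
         * slides down by shift = 4 blocks; physical blocks and extent
         * lengths are untouched, only ee_block values change. */
        uint32_t punch_start = 4, punch_stop = 8;
        uint32_t shift = punch_stop - punch_start;

        uint32_t ee_block = 10;         /* extent starting at 10 ... */
        ee_block -= shift;
        assert(ee_block == 6);          /* ... now starts at 6 */

        /* A 16-block file ends up 12 blocks long; ext4_collapse_range()
         * below shrinks i_size accordingly via truncate_setsize(). */
        assert(16 - shift == 12);
        return 0;
}
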
+/*
+ * ext4_collapse_range:
+ * This implements fallocate's collapse range functionality for ext4.
+ * Returns: 0 on success, non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct super_block *sb = inode->i_sb;
+       ext4_lblk_t punch_start, punch_stop;
+       handle_t *handle;
+       unsigned int credits;
+       loff_t new_size;
+       int ret;
+
+       BUG_ON(offset + len > i_size_read(inode));
+
+       /* Collapse range works only on fs block size aligned offsets. */
+       if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+           len & (EXT4_BLOCK_SIZE(sb) - 1))
+               return -EINVAL;
+
+       if (!S_ISREG(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       trace_ext4_collapse_range(inode, offset, len);
+
+       punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+       punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+       /* Write out all dirty pages */
+       ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
+       if (ret)
+               return ret;
+
+       /* Take mutex lock */
+       mutex_lock(&inode->i_mutex);
+
+       /* It's not possible to collapse a range on an append-only file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               ret = -EPERM;
+               goto out_mutex;
+       }
+
+       if (IS_SWAPFILE(inode)) {
+               ret = -ETXTBSY;
+               goto out_mutex;
+       }
+
+       /* Currently just for extent based files */
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       truncate_pagecache_range(inode, offset, -1);
+
+       /* Wait for existing dio to complete */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
+       credits = ext4_writepage_trans_blocks(inode);
+       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_dio;
+       }
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode);
+
+       ret = ext4_es_remove_extent(inode, punch_start,
+                                   EXT_MAX_BLOCKS - punch_start - 1);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+                                    punch_stop - punch_start);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       new_size = i_size_read(inode) - len;
+       truncate_setsize(inode, new_size);
+       EXT4_I(inode)->i_disksize = new_size;
+
+       ext4_discard_preallocations(inode);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}
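
And the userspace view of collapse range (a sketch: the file name is made up, and offset/len must be multiples of the filesystem block size, as the -EINVAL check above enforces):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>       /* FALLOC_FL_COLLAPSE_RANGE */

int main(void)
{
        int fd = open("video.ts", O_RDWR);
        off_t off = 4096, len = 8192;   /* both block aligned */

        if (fd < 0)
                return 1;

        /* Remove bytes [4096, 12288): later data moves down and the file
         * shrinks by 8192 bytes. Cannot be combined with other flags. */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, off, len) < 0)
                perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");

        close(fd);
        return 0;
}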