ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate
author Lukas Czerner <lczerner@redhat.com>
Tue, 18 Mar 2014 22:05:35 +0000 (18:05 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Tue, 18 Mar 2014 22:05:35 +0000 (18:05 -0400)
Introduce new FALLOC_FL_ZERO_RANGE flag for fallocate. This has the same
functionality as xfs ioctl XFS_IOC_ZERO_RANGE.

It can be used to convert a range of a file to zeros, preferably without
issuing data IO. Blocks should be preallocated for the regions that span
holes in the file, and the entire range is preferably converted to
unwritten extents.

This can also be used to preallocate blocks past EOF in the same way as
with fallocate. The FALLOC_FL_KEEP_SIZE flag can be combined with it, in
which case the inode size should remain the same.

Also add appropriate tracepoints.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/inode.c
include/trace/events/ext4.h

index beec427..1b3cbf8 100644 (file)
@@ -568,6 +568,8 @@ enum {
 #define EXT4_GET_BLOCKS_NO_LOCK                        0x0100
        /* Do not put hole in extent cache */
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE            0x0200
+       /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN      0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the
index 2db2d77..464e95d 100644 (file)
@@ -3602,6 +3602,8 @@ out:
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
  * One of more index blocks maybe needed if the extent tree grow after
  * the uninitialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
@@ -3612,7 +3614,7 @@ out:
  *
  * Returns the size of uninitialized extent to be written on success.
  */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_map_blocks *map,
                                        struct ext4_ext_path *path,
@@ -3624,9 +3626,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        unsigned int ee_len;
        int split_flag = 0, depth;
 
-       ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-               "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)map->m_lblk, map->m_len);
+       ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+                 __func__, inode->i_ino,
+                 (unsigned long long)map->m_lblk, map->m_len);
 
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
@@ -3641,14 +3643,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
 
-       split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-       split_flag |= EXT4_EXT_MARK_UNINIT2;
-       if (flags & EXT4_GET_BLOCKS_CONVERT)
-               split_flag |= EXT4_EXT_DATA_VALID2;
+       /* Convert to unwritten */
+       if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+               split_flag |= EXT4_EXT_DATA_VALID1;
+       /* Convert to initialized */
+       } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+               split_flag |= ee_block + ee_len <= eof_block ?
+                             EXT4_EXT_MAY_ZEROOUT : 0;
+               split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
+       }
        flags |= EXT4_GET_BLOCKS_PRE_IO;
        return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
+/*
+ * Convert the initialized extent that maps map->m_lblk to an unwritten
+ * (uninitialized) one.
+ *
+ * @handle: journal handle the modification is made under
+ * @inode:  inode owning the extent tree
+ * @map:    logical range (m_lblk, m_len) to convert
+ * @path:   extent path whose leaf entry is the extent to convert
+ *
+ * If the extent at the leaf of @path does not start at map->m_lblk or is
+ * longer than map->m_len, it is first split with
+ * ext4_split_convert_extents() and the path is looked up again.  The
+ * extent is then marked uninitialized, ext4_ext_try_to_merge() is given a
+ * chance to merge it, and the leaf is marked dirty.
+ *
+ * Returns the first error encountered, otherwise the result of
+ * ext4_ext_dirty().
+ */
+static int ext4_convert_initialized_extents(handle_t *handle,
+                                           struct inode *inode,
+                                           struct ext4_map_blocks *map,
+                                           struct ext4_ext_path *path)
+{
+       struct ext4_extent *ex;
+       ext4_lblk_t ee_block;
+       unsigned int ee_len;
+       int depth;
+       int err = 0;
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
+
+       ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+                 __func__, inode->i_ino,
+                 (unsigned long long)ee_block, ee_len);
+
+       if (ee_block != map->m_lblk || ee_len > map->m_len) {
+               err = ext4_split_convert_extents(handle, inode, map, path,
+                               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+               if (err < 0)
+                       goto out;
+               ext4_ext_drop_refs(path);
+               path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+               /*
+                * Return directly on lookup failure: the "out" label hands
+                * path to ext4_ext_show_leaf(), which must not be given an
+                * ERR_PTR value.
+                */
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
+               /* The split may have changed the tree; re-read the leaf. */
+               depth = ext_depth(inode);
+               ex = path[depth].p_ext;
+       }
+
+       err = ext4_ext_get_access(handle, inode, path + depth);
+       if (err)
+               goto out;
+       /* first mark the extent as uninitialized */
+       ext4_ext_mark_uninitialized(ex);
+
+       /* note: ext4_ext_correct_indexes() isn't needed here because
+        * borders are not changed
+        */
+       ext4_ext_try_to_merge(handle, inode, path, ex);
+
+       /* Mark modified extent as dirty */
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+       ext4_ext_show_leaf(inode, path);
+       return err;
+}
+
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_map_blocks *map,
@@ -3682,8 +3743,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
 #endif
-               err = ext4_split_unwritten_extents(handle, inode, map, path,
-                                                  EXT4_GET_BLOCKS_CONVERT);
+               err = ext4_split_convert_extents(handle, inode, map, path,
+                                                EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
                        goto out;
                ext4_ext_drop_refs(path);
@@ -3883,6 +3944,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
        return allocated_clusters;
 }
 
+static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+                       struct ext4_map_blocks *map,
+                       struct ext4_ext_path *path, int flags,
+                       unsigned int allocated, ext4_fsblk_t newblock)
+{
+       int ret = 0;
+       int err = 0;
+
+       /*
+        * Convert the initialized extent covering @map to an unwritten one
+        * by calling ext4_convert_initialized_extents(), then report how
+        * many blocks were handled.  @flags and @newblock are accepted but
+        * not used in this function.
+        *
+        * Returns a non-zero error from the conversion or from
+        * check_eofblocks_fl(); otherwise returns @allocated clamped to
+        * map->m_len, and map->m_len is set to that same value.  Note that
+        * EXT4_MAP_UNWRITTEN is set in map->m_flags on the error path too.
+        *
+        * Make sure that the extent is no bigger than we support with
+        * uninitialized extent.  When the request exceeds
+        * EXT_UNINIT_MAX_LEN it is cut to half of that limit, not to the
+        * limit itself.
+        */
+       if (map->m_len > EXT_UNINIT_MAX_LEN)
+               map->m_len = EXT_UNINIT_MAX_LEN / 2;
+
+       ret = ext4_convert_initialized_extents(handle, inode, map,
+                                               path);
+       if (ret >= 0) {
+               ext4_update_inode_fsync_trans(handle, inode, 1);
+               err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                        path, map->m_len);
+       } else
+               err = ret;
+       map->m_flags |= EXT4_MAP_UNWRITTEN;
+       if (allocated > map->m_len)
+               allocated = map->m_len;
+       map->m_len = allocated;
+
+       return err ? err : allocated;
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3910,8 +4003,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
        /* get_block() before submit the IO, split the extent */
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-               ret = ext4_split_unwritten_extents(handle, inode, map,
-                                                  path, flags);
+               ret = ext4_split_convert_extents(handle, inode, map,
+                                        path, flags | EXT4_GET_BLOCKS_CONVERT);
                if (ret <= 0)
                        goto out;
                /*
@@ -4199,6 +4292,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
 
+
                /*
                 * Uninitialized extents are treated as holes, except that
                 * we split out initialized portions during a write.
@@ -4215,7 +4309,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
                                  ee_block, ee_len, newblock);
 
-                       if (!ext4_ext_is_uninitialized(ex))
+                       /*
+                        * If the extent is initialized check whether the
+                        * caller wants to convert it to unwritten.
+                        */
+                       if ((!ext4_ext_is_uninitialized(ex)) &&
+                           (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+                               allocated = ext4_ext_convert_initialized_extent(
+                                               handle, inode, map, path, flags,
+                                               allocated, newblock);
+                               goto out2;
+                       } else if (!ext4_ext_is_uninitialized(ex))
                                goto out;
 
                        ret = ext4_ext_handle_uninitialized_extents(
@@ -4604,6 +4708,144 @@ retry:
        return ret > 0 ? ret2 : ret;
 }
 
+static long ext4_zero_range(struct file *file, loff_t offset,
+                           loff_t len, int mode)
+{
+       struct inode *inode = file_inode(file);
+       handle_t *handle = NULL;
+       unsigned int max_blocks;
+       loff_t new_size = 0;
+       int ret = 0;
+       int flags;
+       int partial;
+       loff_t start, end;
+       ext4_lblk_t lblk;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned int blkbits = inode->i_blkbits;
+
+       trace_ext4_zero_range(inode, offset, len, mode);
+
+       /*
+        * Handle FALLOC_FL_ZERO_RANGE for the byte range
+        * [offset, offset + len) of @file; called from ext4_fallocate().
+        *
+        * The block aligned interior of the range is passed to
+        * ext4_alloc_file_blocks() with EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,
+        * and the unaligned edges are zeroed with
+        * ext4_zero_partial_blocks().  Unless FALLOC_FL_KEEP_SIZE is set in
+        * @mode, i_size and i_disksize are raised to offset + len when the
+        * range ends past the current i_size.
+        *
+        * Returns the error of the first step that fails; otherwise the
+        * value returned by ext4_zero_partial_blocks().
+        *
+        * NOTE(review): out_dio is also reached when max_blocks is 0 and
+        * ext4_journal_start() fails, i.e. without a preceding
+        * ext4_inode_block_unlocked_dio() - confirm that the resume call
+        * is harmless in that case.
+        *
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               ret = filemap_write_and_wait_range(mapping, offset,
+                                                  offset + len - 1);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Round up offset. This is not fallocate, we need to zero out
+        * blocks, so convert interior block aligned part of the range to
+        * unwritten and possibly manually zero out unaligned parts of the
+        * range.
+        *
+        * start/end are the block aligned bounds inside the range; the
+        * sanity check below rejects results that fall outside
+        * [offset, offset + len] (presumably only possible on arithmetic
+        * wrap-around - TODO confirm).  partial is the number of bytes by
+        * which offset + len extends into its last block.
+        */
+       start = round_up(offset, 1 << blkbits);
+       end = round_down((offset + len), 1 << blkbits);
+
+       if (start < offset || end > offset + len)
+               return -EINVAL;
+       partial = (offset + len) & ((1 << blkbits) - 1);
+
+       lblk = start >> blkbits;
+       max_blocks = (end >> blkbits);
+       if (max_blocks < lblk)
+               max_blocks = 0;
+       else
+               max_blocks -= lblk;
+
+       flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+       mutex_lock(&inode->i_mutex);
+
+       /*
+        * Indirect files do not support unwritten extents
+        */
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out_mutex;
+               /*
+                * If we have a partial block after EOF we have to allocate
+                * the entire block.
+                */
+               if (partial)
+                       max_blocks += 1;
+       }
+
+       if (max_blocks > 0) {
+
+               /* Now release the pages and zero block aligned part of pages*/
+               truncate_pagecache_range(inode, start, end - 1);
+
+               /* Wait for all existing dio workers, newcomers will block on i_mutex */
+               ext4_inode_block_unlocked_dio(inode);
+               inode_dio_wait(inode);
+
+               /*
+                * Remove entire range from the extent status tree.
+                */
+               ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+               if (ret)
+                       goto out_dio;
+
+               ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+                                            mode);
+               if (ret)
+                       goto out_dio;
+       }
+
+       handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               ext4_std_error(inode->i_sb, ret);
+               goto out_dio;
+       }
+
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+       if (!ret && new_size) {
+               if (new_size > i_size_read(inode))
+                       i_size_write(inode, new_size);
+               if (new_size > EXT4_I(inode)->i_disksize)
+                       ext4_update_i_disksize(inode, new_size);
+       } else if (!ret && !new_size) {
+               /*
+                * Mark that we allocate beyond EOF so the subsequent truncate
+                * can proceed even if the new size is the same as i_size.
+                */
+               if ((offset + len) > i_size_read(inode))
+                       ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       }
+
+       ext4_mark_inode_dirty(handle, inode);
+
+       /*
+        * Zero out partial block at the edges of the range.  The result of
+        * this call becomes the return value of the function.
+        */
+       ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+       if (file->f_flags & O_SYNC)
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}
+
 /*
  * preallocate space for a file. This implements ext4's fallocate file
  * operation, which gets called from sys_fallocate system call.
@@ -4625,7 +4867,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
        /* Return error if mode is not supported */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-                    FALLOC_FL_COLLAPSE_RANGE))
+                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
 
        if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4645,6 +4887,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EOPNOTSUPP;
 
+       if (mode & FALLOC_FL_ZERO_RANGE)
+               return ext4_zero_range(file, offset, len, mode);
+
        trace_ext4_fallocate_enter(inode, offset, len, mode);
        lblk = offset >> blkbits;
        /*
index ab3e835..7cc2455 100644 (file)
@@ -503,6 +503,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
        struct extent_status es;
        int retval;
+       int ret = 0;
 #ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;
 
@@ -558,7 +559,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
        if (retval > 0) {
-               int ret;
                unsigned int status;
 
                if (unlikely(retval != map->m_len)) {
@@ -585,7 +585,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-               int ret = check_block_validity(inode, map);
+               ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
@@ -602,7 +602,13 @@ found:
         * with buffer head unmapped.
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-               return retval;
+               /*
+                * If we need to convert extent to unwritten
+                * we continue and do the actual work in
+                * ext4_ext_map_blocks()
+                */
+               if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+                       return retval;
 
        /*
         * Here we clear m_flags because after allocating an new extent,
@@ -658,7 +664,6 @@ found:
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
        if (retval > 0) {
-               int ret;
                unsigned int status;
 
                if (unlikely(retval != map->m_len)) {
@@ -693,7 +698,7 @@ found:
 has_zeroout:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-               int ret = check_block_validity(inode, map);
+               ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
@@ -3507,7 +3512,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
 
-       trace_ext4_punch_hole(inode, offset, length);
+       trace_ext4_punch_hole(inode, offset, length, 0);
 
        /*
         * Write out all dirty pages to avoid race conditions
index e9d7ee7..010ea89 100644 (file)
@@ -21,6 +21,10 @@ struct extent_status;
 #define FALLOC_FL_COLLAPSE_RANGE       0x08
 #endif
 
+#ifndef FALLOC_FL_ZERO_RANGE
+#define FALLOC_FL_ZERO_RANGE           0x10
+#endif
+
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
 #define show_mballoc_flags(flags) __print_flags(flags, "|",    \
@@ -77,7 +81,8 @@ struct extent_status;
        { FALLOC_FL_KEEP_SIZE,          "KEEP_SIZE"},           \
        { FALLOC_FL_PUNCH_HOLE,         "PUNCH_HOLE"},          \
        { FALLOC_FL_NO_HIDE_STALE,      "NO_HIDE_STALE"},       \
-       { FALLOC_FL_COLLAPSE_RANGE,     "COLLAPSE_RANGE"})
+       { FALLOC_FL_COLLAPSE_RANGE,     "COLLAPSE_RANGE"},      \
+       { FALLOC_FL_ZERO_RANGE,         "ZERO_RANGE"})
 
 
 TRACE_EVENT(ext4_free_inode,
@@ -1339,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit,
                  __entry->rw, __entry->ret)
 );
 
-TRACE_EVENT(ext4_fallocate_enter,
+DECLARE_EVENT_CLASS(ext4__fallocate_mode,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
 
        TP_ARGS(inode, offset, len, mode),
@@ -1347,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter,
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(        ino_t,  ino                     )
-               __field(        loff_t, pos                     )
-               __field(        loff_t, len                     )
+               __field(        loff_t, offset                  )
+               __field(        loff_t, len                     )
                __field(        int,    mode                    )
        ),
 
        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
-               __entry->pos    = offset;
+               __entry->offset = offset;
                __entry->len    = len;
                __entry->mode   = mode;
        ),
 
-       TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %s",
+       TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino, __entry->pos,
-                 __entry->len, show_falloc_mode(__entry->mode))
+                 (unsigned long) __entry->ino,
+                 __entry->offset, __entry->len,
+                 show_falloc_mode(__entry->mode))
+);
+
+DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
+
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+       TP_ARGS(inode, offset, len, mode)
+);
+
+DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
+
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+       TP_ARGS(inode, offset, len, mode)
+);
+
+DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
+
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+       TP_ARGS(inode, offset, len, mode)
 );
 
 TRACE_EVENT(ext4_fallocate_exit,
@@ -1395,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit,
                  __entry->ret)
 );
 
-TRACE_EVENT(ext4_punch_hole,
-       TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
-
-       TP_ARGS(inode, offset, len),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        loff_t, offset                  )
-               __field(        loff_t, len                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
-               __entry->ino    = inode->i_ino;
-               __entry->offset = offset;
-               __entry->len    = len;
-       ),
-
-       TP_printk("dev %d,%d ino %lu offset %lld len %lld",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino,
-                 __entry->offset, __entry->len)
-);
-
 TRACE_EVENT(ext4_unlink_enter,
        TP_PROTO(struct inode *parent, struct dentry *dentry),