ocfs2: zero tail of sparse files on truncate
authorMark Fasheh <mark.fasheh@oracle.com>
Fri, 16 Feb 2007 19:46:50 +0000 (11:46 -0800)
committerMark Fasheh <mark.fasheh@oracle.com>
Thu, 26 Apr 2007 22:02:20 +0000 (15:02 -0700)
Since we don't zero on extend anymore, truncate needs to be fixed up to zero
the part of a file between i_size and the end of its cluster. Otherwise a
subsequent extend could expose bad data.

This introduced a new helper, which can be used in ocfs2_write().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/ocfs2.h

index 9a40603..98694a1 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -3342,6 +3344,228 @@ bail:
        return status;
 }
 
+/*
+ * walk_page_buffers() callback used by ocfs2_zero_cluster_pages() when
+ * ocfs2_should_order_data() is false. Flags the buffer uptodate and
+ * dirty; the handle is not used and 0 is always returned.
+ */
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+       set_buffer_uptodate(bh);
+       mark_buffer_dirty(bh);
+
+       return 0;
+}
+
+/*
+ * walk_page_buffers() callback used by ocfs2_zero_cluster_pages() when
+ * ocfs2_should_order_data() is true. Flags the buffer uptodate and dirty,
+ * then hands it to ocfs2_journal_dirty_data() and returns that result.
+ */
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+       int status;
+
+       set_buffer_uptodate(bh);
+       mark_buffer_dirty(bh);
+
+       status = ocfs2_journal_dirty_data(handle, bh);
+
+       return status;
+}
+
+/*
+ * Zero the part of the given pages that lies past isize and flag the
+ * zeroed buffers uptodate and dirty.
+ *
+ * @inode:    inode being truncated; its file system must be sparse-alloc
+ * @isize:    new size; its offset within a page is where zeroing starts
+ *            in the first page
+ * @pages:    pages to zero; each one is unlocked, marked accessed and
+ *            released before this function returns
+ * @numpages: number of entries in @pages
+ * @phys:     block number handed to ocfs2_map_page_blocks()
+ * @handle:   transaction handle handed to the buffer callbacks
+ *
+ * Failures from ocfs2_map_page_blocks() and walk_page_buffers() are
+ * logged with mlog_errno() and otherwise ignored; nothing is returned.
+ */
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+                                    struct page **pages, int numpages,
+                                    u64 phys, handle_t *handle)
+{
+       int i, ret, partial;
+       void *kaddr;
+       struct page *page;
+       unsigned int from, to = PAGE_CACHE_SIZE;
+       struct super_block *sb = inode->i_sb;
+       int (*zero_func)(handle_t *handle, struct buffer_head *bh);
+
+       BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+       if (numpages == 0)
+               goto out;
+
+       from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+       if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+               /*
+                * Since 'from' has been capped to a value below page
+                * size, this calculation won't be able to overflow
+                * 'to'
+                */
+               to = ocfs2_align_bytes_to_clusters(sb, from);
+
+               /*
+                * The truncate tail in this case should never contain
+                * more than one page at maximum. The loop below also
+                * assumes this.
+                */
+               BUG_ON(numpages != 1);
+       }
+
+       for (i = 0; i < numpages; i++) {
+               page = pages[i];
+
+               /*
+                * walk_page_buffers() reports through 'partial'. Clear
+                * it for every page so that a value set while walking
+                * one page cannot suppress SetPageUptodate() on the
+                * pages that follow it.
+                */
+               partial = 0;
+
+               BUG_ON(from > PAGE_CACHE_SIZE);
+               BUG_ON(to > PAGE_CACHE_SIZE);
+
+               ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+               if (ret)
+                       mlog_errno(ret);
+
+               kaddr = kmap_atomic(page, KM_USER0);
+               memset(kaddr + from, 0, to - from);
+               kunmap_atomic(kaddr, KM_USER0);
+
+               /*
+                * Need to set the buffers we zero'd into uptodate
+                * here if they aren't - ocfs2_map_page_blocks()
+                * might've skipped some
+                */
+               if (ocfs2_should_order_data(inode))
+                       zero_func = ocfs2_ordered_zero_func;
+               else
+                       zero_func = ocfs2_writeback_zero_func;
+
+               ret = walk_page_buffers(handle, page_buffers(page),
+                                       from, to, &partial, zero_func);
+               if (ret < 0)
+                       mlog_errno(ret);
+
+               if (!partial)
+                       SetPageUptodate(page);
+
+               flush_dcache_page(page);
+
+               /*
+                * Every page after the 1st one should be completely zero'd.
+                */
+               from = 0;
+       }
+out:
+       if (pages) {
+               for (i = 0; i < numpages; i++) {
+                       page = pages[i];
+                       unlock_page(page);
+                       mark_page_accessed(page);
+                       page_cache_release(page);
+               }
+       }
+}
+
+/*
+ * Grab the page cache pages that cover the range from isize up to the end
+ * of the cluster containing isize.
+ *
+ * @inode: inode whose tail is being examined
+ * @isize: byte offset at which the tail starts
+ * @pages: caller-provided array that receives the grabbed pages
+ * @num:   set to the number of pages stored in @pages (0 on error)
+ * @phys:  filled in by ocfs2_extent_map_get_blocks() for the block at isize
+ *
+ * Returns 0 without grabbing anything when isize is cluster aligned or
+ * when the block at isize maps to physical block 0 (a hole). If a page
+ * cannot be grabbed, the pages taken so far are unlocked and released and
+ * -ENOMEM is returned. A mapping failure is returned as is.
+ */
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+                               int *num, u64 *phys)
+{
+       struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned int csize = OCFS2_SB(sb)->s_clustersize;
+       unsigned long index;
+       u64 cluster_end;
+       int grabbed = 0, status = 0, i;
+
+       BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+       /* Already cluster aligned: there is no tail to cover. */
+       if (!(isize & (csize - 1)))
+               goto out;
+
+       status = ocfs2_extent_map_get_blocks(inode,
+                                            isize >> sb->s_blocksize_bits,
+                                            phys, NULL);
+       if (status) {
+               mlog_errno(status);
+               goto out;
+       }
+
+       /* A zero physical block means the tail is a hole. */
+       if (!*phys)
+               goto out;
+
+       cluster_end = ocfs2_align_bytes_to_clusters(sb, isize);
+       index = isize >> PAGE_CACHE_SHIFT;
+
+       /*
+        * The page holding isize is always grabbed, even when the end of
+        * the cluster falls inside that same page.
+        */
+       for (;;) {
+               pages[grabbed] = grab_cache_page(mapping, index);
+               if (!pages[grabbed]) {
+                       status = -ENOMEM;
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               grabbed++;
+               index++;
+               if (index >= (cluster_end >> PAGE_CACHE_SHIFT))
+                       break;
+       }
+
+out:
+       if (status) {
+               /* Every slot below 'grabbed' was filled by the loop above. */
+               for (i = 0; i < grabbed; i++) {
+                       unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               grabbed = 0;
+       }
+
+       *num = grabbed;
+
+       return status;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ *
+ * @inode:      inode being truncated
+ * @handle:     transaction handle passed on to ocfs2_zero_cluster_pages()
+ * @new_i_size: size the file is about to be truncated to
+ *
+ * Returns 0 when there is nothing to do or the zeroing was started,
+ * -ENOMEM if the page array cannot be allocated, or the error reported
+ * by ocfs2_grab_eof_pages() or filemap_fdatawrite().
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                u64 new_i_size)
+{
+       int ret, numpages;
+       struct page **pages;
+       u64 phys;
+
+       /*
+        * File systems which don't support sparse files zero on every
+        * extend.
+        */
+       if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+               return 0;
+
+       pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+                       sizeof(struct page *), GFP_NOFS);
+       if (pages == NULL) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /*
+        * No pages were grabbed: the new size sits on a cluster boundary
+        * or the tail is a hole - nothing more to do.
+        */
+       if (numpages == 0)
+               goto out;
+
+       /* This also unlocks and releases every page in the array. */
+       ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+                                handle);
+
+       /*
+        * Initiate writeout of the pages we zero'd here. We don't
+        * wait on them - the truncate_inode_pages() call later will
+        * do that for us.
+        */
+       ret = filemap_fdatawrite(inode->i_mapping);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       /* kfree(NULL) is a no-op, so the allocation failure path is safe. */
+       kfree(pages);
+
+       return ret;
+}
+
 /*
  * It is expected, that by the time you call this function,
  * inode->i_size and fe->i_size have been adjusted.
index bff2a16..3cb39cd 100644 (file)
@@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
        struct buffer_head *tc_last_eb_bh;
 };
 
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                u64 new_i_size);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct buffer_head *fe_bh,
index acf8f00..605c82a 100644 (file)
@@ -308,13 +308,13 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
  * their fixes when they happen) --Mark */
-static int walk_page_buffers(  handle_t *handle,
-                               struct buffer_head *head,
-                               unsigned from,
-                               unsigned to,
-                               int *partial,
-                               int (*fn)(      handle_t *handle,
-                                               struct buffer_head *bh))
+int walk_page_buffers( handle_t *handle,
+                       struct buffer_head *head,
+                       unsigned from,
+                       unsigned to,
+                       int *partial,
+                       int (*fn)(      handle_t *handle,
+                                       struct buffer_head *bh))
 {
        struct buffer_head *bh;
        unsigned block_start, block_end;
@@ -654,9 +654,9 @@ static void ocfs2_clear_page_regions(struct page *page,
  *
  * This will also skip zeroing, which is handled externally.
  */
-static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
-                                struct inode *inode, unsigned int from,
-                                unsigned int to, int new)
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                         struct inode *inode, unsigned int from,
+                         unsigned int to, int new)
 {
        int ret = 0;
        struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
@@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
                 * Ignore blocks outside of our i/o range -
                 * they may belong to unallocated clusters.
                 */
-               if (block_start >= to ||
-                   (block_start + bsize) <= from) {
+               if (block_start >= to || block_end <= from) {
                        if (PageUptodate(page))
                                set_buffer_uptodate(bh);
                        continue;
@@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
        u64 v_blkno, p_blkno;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
-       unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
        unsigned long index, start;
        struct page **cpages;
 
@@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 
        /*
         * Figure out how many pages we'll be manipulating here. For
-        * non-allocating write, or any writes where cluster size is
-        * less than page size, we only need one page. Otherwise,
-        * allocating writes of cluster size larger than page size
-        * need cluster size pages.
+        * non allocating write, we just change the one
+        * page. Otherwise, we'll need a whole clusters worth.
         */
-       if (new && !wc->w_large_pages)
-               numpages = (1 << cbits) / PAGE_SIZE;
+       if (new)
+               numpages = ocfs2_pages_per_cluster(inode->i_sb);
 
        cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
        if (!cpages) {
index eeb2c42..7d94071 100644 (file)
@@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         unsigned from,
                                                         unsigned to);
 
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                         struct inode *inode, unsigned int from,
+                         unsigned int to, int new);
+
+int walk_page_buffers( handle_t *handle,
+                       struct buffer_head *head,
+                       unsigned from,
+                       unsigned to,
+                       int *partial,
+                       int (*fn)(      handle_t *handle,
+                                       struct buffer_head *bh));
+
 struct ocfs2_write_ctxt;
 typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
                                u64 *, unsigned int *, unsigned int *);
index 667e5a8..5fd49ec 100644 (file)
@@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 {
        int status;
        handle_t *handle;
+       struct ocfs2_dinode *di;
 
        mlog_entry_void();
 
@@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                goto out;
        }
 
-       status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+       status = ocfs2_journal_access(handle, inode, fe_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto out_commit;
+       }
+
+       /*
+        * Do this before setting i_size.
+        */
+       status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+       if (status) {
+               mlog_errno(status);
+               goto out_commit;
+       }
+
+       i_size_write(inode, new_i_size);
+       inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+       di = (struct ocfs2_dinode *) fe_bh->b_data;
+       di->i_size = cpu_to_le64(new_i_size);
+       di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+       di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+       status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0)
                mlog_errno(status);
 
+out_commit:
        ocfs2_commit_trans(osb, handle);
 out:
+
        mlog_exit(status);
        return status;
 }
@@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
                mlog_errno(status);
                goto bail;
        }
-       ocfs2_data_unlock(inode, 1);
 
        /* alright, we're going to need to do a full blown alloc size
         * change. Orphan the inode so that recovery can complete the
@@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
        if (status < 0) {
                mlog_errno(status);
-               goto bail;
+               goto bail_unlock_data;
        }
 
        status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
        if (status < 0) {
                mlog_errno(status);
-               goto bail;
+               goto bail_unlock_data;
        }
 
        status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
        if (status < 0) {
                mlog_errno(status);
-               goto bail;
+               goto bail_unlock_data;
        }
 
        /* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+       ocfs2_data_unlock(inode, 1);
+
 bail:
 
        mlog_exit(status);
index 0bd86a1..78c99b5 100644 (file)
@@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
        int status = 0;
        struct ocfs2_truncate_context *tc = NULL;
        struct ocfs2_dinode *fe;
+       handle_t *handle = NULL;
 
        mlog_entry_void();
 
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
        if (fe->i_clusters) {
+               handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+               if (IS_ERR(handle)) {
+                       status = PTR_ERR(handle);
+                       /*
+                        * An ERR_PTR value is non-NULL. Clear it so the
+                        * cleanup at 'out', which commits any non-NULL
+                        * handle, does not commit an error pointer.
+                        */
+                       handle = NULL;
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               status = ocfs2_journal_access(handle, inode, fe_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               i_size_write(inode, 0);
+
+               status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               ocfs2_commit_trans(osb, handle);
+               handle = NULL;
+
                status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
                if (status < 0) {
                        mlog_errno(status);
@@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                        goto out;
                }
        }
-out:
 
+out:
+       if (handle)
+               ocfs2_commit_trans(osb, handle);
        mlog_exit(status);
        return status;
 }
index 2699f7c..82cc92d 100644 (file)
@@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc
        return index;
 }
 
+/*
+ * Number of page cache pages needed to cover one cluster. A cluster that
+ * is no larger than a page still counts as one page.
+ */
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+       unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+       if (cbits <= PAGE_CACHE_SHIFT)
+               return 1;
+
+       return 1 << (cbits - PAGE_CACHE_SHIFT);
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit