ocfs2: zero tail of sparse files on truncate

author Mark Fasheh <mark.fasheh@oracle.com>

Fri, 16 Feb 2007 19:46:50 +0000 (11:46 -0800)

committer Mark Fasheh <mark.fasheh@oracle.com>

Thu, 26 Apr 2007 22:02:20 +0000 (15:02 -0700)
author Mark Fasheh <mark.fasheh@oracle.com>
Fri, 16 Feb 2007 19:46:50 +0000 (11:46 -0800)
committer Mark Fasheh <mark.fasheh@oracle.com>
Thu, 26 Apr 2007 22:02:20 +0000 (15:02 -0700)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c

index 9a40603..98694a1 100644 (file)
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
  #include <linux/types.h>
  #include <linux/slab.h>
  #include <linux/highmem.h>
+#include <linux/swap.h>
  
  #define MLOG_MASK_PREFIX ML_DISK_ALLOC
  #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
  #include "ocfs2.h"
  
  #include "alloc.h"
+#include "aops.h"
  #include "dlmglue.h"
  #include "extent_map.h"
  #include "inode.h"
@@ -3342,6 +3344,228 @@ bail:
         return status;
  }
  
+/*
+ * Per-buffer callback handed to walk_page_buffers() by
+ * ocfs2_zero_cluster_pages() when ocfs2_should_order_data() is false.
+ *
+ * Marks the buffer uptodate and then dirty. The handle argument is not
+ * used. Always returns 0.
+ */
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+       set_buffer_uptodate(bh);
+       mark_buffer_dirty(bh);
+       return 0;
+}
+
+/*
+ * Per-buffer callback handed to walk_page_buffers() by
+ * ocfs2_zero_cluster_pages() when ocfs2_should_order_data() is true.
+ *
+ * Marks the buffer uptodate and dirty, then passes it to
+ * ocfs2_journal_dirty_data() under the given handle and returns that
+ * call's result.
+ */
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+       set_buffer_uptodate(bh);
+       mark_buffer_dirty(bh);
+       return ocfs2_journal_dirty_data(handle, bh);
+}
+
+/*
+ * Zero the page cache contents between isize and the end of its
+ * cluster, and dirty the buffers backing the zeroed range.
+ *
+ * inode:    inode being truncated; its file system must support sparse
+ *           allocation (enforced with BUG_ON).
+ * isize:    the size being truncated to; its offset within the first
+ *           page is where zeroing starts.
+ * pages:    array of numpages pages covering the tail. Each one is
+ *           unlocked, marked accessed and released before returning,
+ *           whether or not zeroing succeeded.
+ * numpages: number of valid entries in pages; 0 means nothing to zero.
+ * phys:     block number passed on to ocfs2_map_page_blocks().
+ * handle:   transaction handle passed to the per-buffer callbacks.
+ *
+ * Mapping and buffer-walk errors are logged with mlog_errno() and do
+ * not stop processing of the remaining pages.
+ */
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+                                    struct page **pages, int numpages,
+                                    u64 phys, handle_t *handle)
+{
+       int i, ret, partial;
+       void *kaddr;
+       struct page *page;
+       unsigned int from, to = PAGE_CACHE_SIZE;
+       struct super_block *sb = inode->i_sb;
+       int (*zero_func)(handle_t *handle, struct buffer_head *bh);
+
+       BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+       if (numpages == 0)
+               goto out;
+
+       from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+       if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+               /*
+                * Since 'from' has been capped to a value below page
+                * size, this calculation won't be able to overflow
+                * 'to'
+                */
+               to = ocfs2_align_bytes_to_clusters(sb, from);
+
+               /*
+                * The truncate tail in this case should never contain
+                * more than one page at maximum. The loop below also
+                * assumes this.
+                */
+               BUG_ON(numpages != 1);
+       }
+
+       for(i = 0; i < numpages; i++) {
+               page = pages[i];
+
+               /*
+                * 'partial' describes a single page. Start from zero
+                * for each one, otherwise a partially covered first
+                * page would keep every later, fully zeroed page from
+                * being marked uptodate below.
+                */
+               partial = 0;
+
+               BUG_ON(from > PAGE_CACHE_SIZE);
+               BUG_ON(to > PAGE_CACHE_SIZE);
+
+               ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+               if (ret)
+                       mlog_errno(ret);
+
+               kaddr = kmap_atomic(page, KM_USER0);
+               memset(kaddr + from, 0, to - from);
+               kunmap_atomic(kaddr, KM_USER0);
+
+               /*
+                * Need to set the buffers we zero'd into uptodate
+                * here if they aren't - ocfs2_map_page_blocks()
+                * might've skipped some
+                */
+               if (ocfs2_should_order_data(inode))
+                       zero_func = ocfs2_ordered_zero_func;
+               else
+                       zero_func = ocfs2_writeback_zero_func;
+
+               ret = walk_page_buffers(handle, page_buffers(page),
+                                       from, to, &partial, zero_func);
+               if (ret < 0)
+                       mlog_errno(ret);
+
+               if (!partial)
+                       SetPageUptodate(page);
+
+               flush_dcache_page(page);
+
+               /*
+                * Every page after the 1st one should be completely zero'd.
+                */
+               from = 0;
+       }
+out:
+       if (pages) {
+               for (i = 0; i < numpages; i++) {
+                       page = pages[i];
+                       unlock_page(page);
+                       mark_page_accessed(page);
+                       page_cache_release(page);
+               }
+       }
+}
+
+/*
+ * Grab the page cache pages covering the range from isize up to the
+ * end of the cluster that contains it.
+ *
+ * On success, *num is the number of entries filled in pages[] and the
+ * function returns 0. *num is 0 when isize sits on a cluster boundary
+ * (in which case *phys is not written) or when
+ * ocfs2_extent_map_get_blocks() reports block 0 for the tail, i.e. a
+ * hole.
+ *
+ * On failure, every page grabbed so far is unlocked and released, *num
+ * is set to 0 and the error is returned.
+ */
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+                               int *num, u64 *phys)
+{
+       struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned int csize = OCFS2_SB(sb)->s_clustersize;
+       unsigned long index;
+       u64 cluster_end;
+       int i, grabbed = 0, ret = 0;
+
+       BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+       /* Nothing trails isize inside its cluster. */
+       if (!(isize & (csize - 1)))
+               goto done;
+
+       ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+                                         phys, NULL);
+       if (ret) {
+               mlog_errno(ret);
+               goto done;
+       }
+
+       /* The tail has no allocation behind it. */
+       if (!*phys)
+               goto done;
+
+       cluster_end = ocfs2_align_bytes_to_clusters(sb, isize);
+       index = isize >> PAGE_CACHE_SHIFT;
+       for (;;) {
+               pages[grabbed] = grab_cache_page(mapping, index);
+               if (pages[grabbed] == NULL) {
+                       ret = -ENOMEM;
+                       mlog_errno(ret);
+                       break;
+               }
+               grabbed++;
+
+               /* Always take at least one page, then stop at the cluster end. */
+               if (++index >= (cluster_end >> PAGE_CACHE_SHIFT))
+                       break;
+       }
+
+done:
+       if (ret) {
+               /* Entries below 'grabbed' are known to be valid pages. */
+               for (i = 0; i < grabbed; i++) {
+                       unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               grabbed = 0;
+       }
+
+       *num = grabbed;
+       return ret;
+}
+
+/*
+ * Zero whatever lies between new_i_size and the end of its allocated
+ * cluster, so that nonzero data is not exposed if the file is later
+ * extended over that range.
+ *
+ * This must run before i_size is updated on the inode, because
+ * block_write_full_page() skips writeout of pages past i_size. That is
+ * why the target size is passed in as new_i_size.
+ *
+ * Returns 0 when there was nothing to zero or writeout of the zeroed
+ * pages was started; otherwise returns the error from the allocation,
+ * from ocfs2_grab_eof_pages() or from filemap_fdatawrite().
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                u64 new_i_size)
+{
+       u64 phys;
+       int numpages, ret;
+       struct page **pages;
+
+       /*
+        * Without sparse file support the file system zeros on every
+        * extend, so there is no stale tail to worry about.
+        */
+       if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+               return 0;
+
+       pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+                       sizeof(struct page *), GFP_NOFS);
+       if (!pages) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               return ret;
+       }
+
+       ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+       if (ret) {
+               mlog_errno(ret);
+       } else if (numpages != 0) {
+               /*
+                * numpages == 0 means the new size is on a cluster
+                * boundary or the tail is a hole; only zero otherwise.
+                */
+               ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages,
+                                        phys, handle);
+
+               /*
+                * Kick off writeout of the zeroed pages without
+                * waiting; the later truncate_inode_pages() call
+                * waits on them.
+                */
+               ret = filemap_fdatawrite(inode->i_mapping);
+               if (ret)
+                       mlog_errno(ret);
+       }
+
+       kfree(pages);
+       return ret;
+}
+
  /*
   * It is expected, that by the time you call this function,
   * inode->i_size and fe->i_size have been adjusted.
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h

index bff2a16..3cb39cd 100644 (file)
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
         struct buffer_head *tc_last_eb_bh;
  };
  
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                u64 new_i_size);
  int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                            struct inode *inode,
                            struct buffer_head *fe_bh,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c

index acf8f00..605c82a 100644 (file)
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -308,13 +308,13 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
   * functionality yet, but IMHO it's better to cut and paste the whole
   * thing so we can avoid introducing our own bugs (and easily pick up
   * their fixes when they happen) --Mark */
-static int walk_page_buffers(  handle_t *handle,
-                               struct buffer_head *head,
-                               unsigned from,
-                               unsigned to,
-                               int *partial,
-                               int (*fn)(      handle_t *handle,
-                                               struct buffer_head *bh))
+int walk_page_buffers( handle_t *handle,
+                       struct buffer_head *head,
+                       unsigned from,
+                       unsigned to,
+                       int *partial,
+                       int (*fn)(      handle_t *handle,
+                                       struct buffer_head *bh))
  {
         struct buffer_head *bh;
         unsigned block_start, block_end;
@@ -654,9 +654,9 @@ static void ocfs2_clear_page_regions(struct page *page,
   *
   * This will also skip zeroing, which is handled externally.
   */
-static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
-                                struct inode *inode, unsigned int from,
-                                unsigned int to, int new)
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                         struct inode *inode, unsigned int from,
+                         unsigned int to, int new)
  {
         int ret = 0;
         struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
@@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
                  * Ignore blocks outside of our i/o range -
                  * they may belong to unallocated clusters.
                  */
-               if (block_start >= to ||
-                   (block_start + bsize) <= from) {
+               if (block_start >= to || block_end <= from) {
                         if (PageUptodate(page))
                                 set_buffer_uptodate(bh);
                         continue;
@@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
         u64 v_blkno, p_blkno;
         struct address_space *mapping = file->f_mapping;
         struct inode *inode = mapping->host;
-       unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
         unsigned long index, start;
         struct page **cpages;
  
@@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
  
         /*
          * Figure out how many pages we'll be manipulating here. For
-        * non-allocating write, or any writes where cluster size is
-        * less than page size, we only need one page. Otherwise,
-        * allocating writes of cluster size larger than page size
-        * need cluster size pages.
+        * non-allocating write, we just change the one
+        * page. Otherwise, we'll need a whole cluster's worth.
          */
-       if (new && !wc->w_large_pages)
-               numpages = (1 << cbits) / PAGE_SIZE;
+       if (new)
+               numpages = ocfs2_pages_per_cluster(inode->i_sb);
  
         cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
         if (!cpages) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h

index eeb2c42..7d94071 100644 (file)
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                          unsigned from,
                                                          unsigned to);
  
+/*
+ * Declared here (rather than kept static in aops.c) because
+ * ocfs2_zero_cluster_pages() in alloc.c calls it on each truncate-tail
+ * page, passing page offsets as from/to and 0 for new.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                         struct inode *inode, unsigned int from,
+                         unsigned int to, int new);
+
+/*
+ * Declared here (rather than kept static in aops.c) because
+ * ocfs2_zero_cluster_pages() in alloc.c uses it to run a per-buffer
+ * callback (fn) over a page's buffers, again with page offsets as
+ * from/to. A negative return is treated as an error by that caller.
+ */
+int walk_page_buffers( handle_t *handle,
+                       struct buffer_head *head,
+                       unsigned from,
+                       unsigned to,
+                       int *partial,
+                       int (*fn)(      handle_t *handle,
+                                       struct buffer_head *bh));
+
  struct ocfs2_write_ctxt;
  typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
                                 u64 *, unsigned int *, unsigned int *);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c

index 667e5a8..5fd49ec 100644 (file)
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
  {
         int status;
         handle_t *handle;
+       struct ocfs2_dinode *di;
  
         mlog_entry_void();
  
@@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                 goto out;
         }
  
-       status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+       status = ocfs2_journal_access(handle, inode, fe_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto out_commit;
+       }
+
+       /*
+        * Do this before setting i_size.
+        */
+       status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+       if (status) {
+               mlog_errno(status);
+               goto out_commit;
+       }
+
+       i_size_write(inode, new_i_size);
+       inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+       di = (struct ocfs2_dinode *) fe_bh->b_data;
+       di->i_size = cpu_to_le64(new_i_size);
+       di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+       di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+       status = ocfs2_journal_dirty(handle, fe_bh);
         if (status < 0)
                 mlog_errno(status);
  
+out_commit:
         ocfs2_commit_trans(osb, handle);
  out:
+
         mlog_exit(status);
         return status;
  }
@@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
                 mlog_errno(status);
                 goto bail;
         }
-       ocfs2_data_unlock(inode, 1);
  
         /* alright, we're going to need to do a full blown alloc size
          * change. Orphan the inode so that recovery can complete the
@@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
         status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
         if (status < 0) {
                 mlog_errno(status);
-               goto bail;
+               goto bail_unlock_data;
         }
  
         status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
         if (status < 0) {
                 mlog_errno(status);
-               goto bail;
+               goto bail_unlock_data;
         }
  
         status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
         if (status < 0) {
                 mlog_errno(status);
-               goto bail;
+               goto bail_unlock_data;
         }
  
         /* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+       ocfs2_data_unlock(inode, 1);
+
  bail:
  
         mlog_exit(status);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c

index 0bd86a1..78c99b5 100644 (file)
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
         int status = 0;
         struct ocfs2_truncate_context *tc = NULL;
         struct ocfs2_dinode *fe;
+       handle_t *handle = NULL;
  
         mlog_entry_void();
  
         fe = (struct ocfs2_dinode *) fe_bh->b_data;
  
         if (fe->i_clusters) {
+               handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+               if (IS_ERR(handle)) {
+                       status = PTR_ERR(handle);
+                       /*
+                        * An error pointer is non-NULL. Clear it so
+                        * the "if (handle)" check at the out label
+                        * does not try to commit a transaction that
+                        * was never started.
+                        */
+                       handle = NULL;
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               status = ocfs2_journal_access(handle, inode, fe_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               i_size_write(inode, 0);
+
+               status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto out;
+               }
+
+               ocfs2_commit_trans(osb, handle);
+               handle = NULL;
+
                 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
                 if (status < 0) {
                         mlog_errno(status);
@@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                         goto out;
                 }
         }
-out:
  
+out:
+       if (handle)
+               ocfs2_commit_trans(osb, handle);
         mlog_exit(status);
         return status;
  }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h

index 2699f7c..82cc92d 100644 (file)
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc
         return index;
  }
  
+/*
+ * Number of page cache pages needed to cover one cluster: 1 when the
+ * cluster is no larger than a page, otherwise cluster size divided by
+ * page size.
+ */
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+       unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+       /* A cluster that fits within a page still occupies one page. */
+       if (!(PAGE_CACHE_SHIFT < cbits))
+               return 1;
+
+       return 1 << (cbits - PAGE_CACHE_SHIFT);
+}
+
  #define ocfs2_set_bit ext2_set_bit
  #define ocfs2_clear_bit ext2_clear_bit
  #define ocfs2_test_bit ext2_test_bit
author	Mark Fasheh <mark.fasheh@oracle.com>
	Fri, 16 Feb 2007 19:46:50 +0000 (11:46 -0800)
committer	Mark Fasheh <mark.fasheh@oracle.com>
	Thu, 26 Apr 2007 22:02:20 +0000 (15:02 -0700)
fs/ocfs2/alloc.c		patch \| blob \| history
fs/ocfs2/alloc.h		patch \| blob \| history
fs/ocfs2/aops.c		patch \| blob \| history
fs/ocfs2/aops.h		patch \| blob \| history
fs/ocfs2/file.c		patch \| blob \| history
fs/ocfs2/inode.c		patch \| blob \| history
fs/ocfs2/ocfs2.h		patch \| blob \| history