diff --git a/mm/shmem.c b/mm/shmem.c
index db72d8e..d576b84 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
+#include <linux/falloc.h>
 #include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
        char value[0];
 };
 
+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+       pgoff_t start;          /* start of range currently being fallocated */
+       pgoff_t next;           /* the next page offset to be fallocated */
+       pgoff_t nr_falloced;    /* how many new pages have been fallocated */
+       pgoff_t nr_unswapped;   /* how often writepage refused to swap out */
+};
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
        SGP_DIRTY,      /* like SGP_CACHE, but set new page dirty */
-       SGP_WRITE,      /* may exceed i_size, may allocate page */
+       SGP_WRITE,      /* may exceed i_size, may allocate !Uptodate page */
+       SGP_FALLOC,     /* like SGP_WRITE, but make existing page Uptodate */
 };
 
 #ifdef CONFIG_TMPFS
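
The struct above is the shared state behind that handshake: shmem_fallocate() keeps it on its own stack and publishes the pointer through inode->i_private (with i_mutex ensuring a single user at a time), while shmem_writepage() inspects it under inode->i_lock. A minimal user-space sketch of the pattern only — hypothetical names (falloc_desc, publish_desc, note_unswapped), with a pthread spinlock standing in for i_lock; not the patch code itself:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct falloc_desc {				/* plays the role of struct shmem_falloc */
	unsigned long start, next;
	unsigned long nr_falloced, nr_unswapped;
};

static pthread_spinlock_t i_lock;		/* stands in for inode->i_lock */
static struct falloc_desc *i_private;		/* stands in for inode->i_private */

static void publish_desc(struct falloc_desc *d)	/* "fallocate" side */
{
	pthread_spin_lock(&i_lock);
	i_private = d;
	pthread_spin_unlock(&i_lock);
}

static bool note_unswapped(unsigned long index)	/* "writepage" side */
{
	bool in_progress = false;

	pthread_spin_lock(&i_lock);
	if (i_private && index >= i_private->start && index < i_private->next) {
		i_private->nr_unswapped++;
		in_progress = true;
	}
	pthread_spin_unlock(&i_lock);
	return in_progress;
}

int main(void)
{
	struct falloc_desc d = { .start = 0, .next = 3 };

	pthread_spin_init(&i_lock, PTHREAD_PROCESS_PRIVATE);
	publish_desc(&d);
	printf("index 2 is in the in-flight range: %d\n", note_unswapped(2));
	publish_desc(NULL);			/* like clearing i_private at "undone:" */
	return 0;
}
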
@@ -426,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
 
 /*
  * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+                                                                bool unfalloc)
 {
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-       pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+       pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+       unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+       unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
        struct pagevec pvec;
        pgoff_t indices[PAGEVEC_SIZE];
        long nr_swaps_freed = 0;
        pgoff_t index;
        int i;
 
-       BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+       if (lend == -1)
+               end = -1;       /* unsigned, so actually very big */
 
        pagevec_init(&pvec, 0);
        index = start;
-       while (index <= end) {
+       while (index < end) {
                pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                               min(end - index, (pgoff_t)PAGEVEC_SIZE),
                                                        pvec.pages, indices);
                if (!pvec.nr)
                        break;
@@ -455,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
                        struct page *page = pvec.pages[i];
 
                        index = indices[i];
-                       if (index > end)
+                       if (index >= end)
                                break;
 
                        if (radix_tree_exceptional_entry(page)) {
+                               if (unfalloc)
+                                       continue;
                                nr_swaps_freed += !shmem_free_swap(mapping,
                                                                index, page);
                                continue;
@@ -466,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 
                        if (!trylock_page(page))
                                continue;
-                       if (page->mapping == mapping) {
-                               VM_BUG_ON(PageWriteback(page));
-                               truncate_inode_page(mapping, page);
+                       if (!unfalloc || !PageUptodate(page)) {
+                               if (page->mapping == mapping) {
+                                       VM_BUG_ON(PageWriteback(page));
+                                       truncate_inode_page(mapping, page);
+                               }
                        }
                        unlock_page(page);
                }
@@ -479,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
                index++;
        }
 
-       if (partial) {
+       if (partial_start) {
                struct page *page = NULL;
                shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
                if (page) {
-                       zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+                       unsigned int top = PAGE_CACHE_SIZE;
+                       if (start > end) {
+                               top = partial_end;
+                               partial_end = 0;
+                       }
+                       zero_user_segment(page, partial_start, top);
+                       set_page_dirty(page);
+                       unlock_page(page);
+                       page_cache_release(page);
+               }
+       }
+       if (partial_end) {
+               struct page *page = NULL;
+               shmem_getpage(inode, end, &page, SGP_READ, NULL);
+               if (page) {
+                       zero_user_segment(page, 0, partial_end);
                        set_page_dirty(page);
                        unlock_page(page);
                        page_cache_release(page);
                }
        }
+       if (start >= end)
+               return;
 
        index = start;
        for ( ; ; ) {
                cond_resched();
                pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                               min(end - index, (pgoff_t)PAGEVEC_SIZE),
                                                        pvec.pages, indices);
                if (!pvec.nr) {
-                       if (index == start)
+                       if (index == start || unfalloc)
                                break;
                        index = start;
                        continue;
                }
-               if (index == start && indices[0] > end) {
+               if ((index == start || unfalloc) && indices[0] >= end) {
                        shmem_deswap_pagevec(&pvec);
                        pagevec_release(&pvec);
                        break;
@@ -512,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
                        struct page *page = pvec.pages[i];
 
                        index = indices[i];
-                       if (index > end)
+                       if (index >= end)
                                break;
 
                        if (radix_tree_exceptional_entry(page)) {
+                               if (unfalloc)
+                                       continue;
                                nr_swaps_freed += !shmem_free_swap(mapping,
                                                                index, page);
                                continue;
                        }
 
                        lock_page(page);
-                       if (page->mapping == mapping) {
-                               VM_BUG_ON(PageWriteback(page));
-                               truncate_inode_page(mapping, page);
+                       if (!unfalloc || !PageUptodate(page)) {
+                               if (page->mapping == mapping) {
+                                       VM_BUG_ON(PageWriteback(page));
+                                       truncate_inode_page(mapping, page);
+                               }
                        }
                        unlock_page(page);
                }
@@ -538,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
        info->swapped -= nr_swaps_freed;
        shmem_recalc_inode(inode);
        spin_unlock(&info->lock);
+}
 
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+       shmem_undo_range(inode, lstart, lend, false);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
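
As a worked illustration of the boundary arithmetic above — a hypothetical punch of bytes 1000..8191 with 4096-byte pages (PAGE_CACHE_SHIFT of 12); the values are made up, the formulas are the ones in shmem_undo_range():

#include <stdio.h>

int main(void)
{
	long long lstart = 1000, lend = 8191;	/* hypothetical punch range */
	unsigned shift = 12, size = 1u << shift;

	unsigned long start = (lstart + size - 1) >> shift;	/* 1: first whole page removed */
	unsigned long end = (lend + 1) >> shift;		/* 2: first page kept */
	unsigned partial_start = lstart & (size - 1);		/* 1000 */
	unsigned partial_end = (lend + 1) & (size - 1);		/* 0: range ends on a page boundary */

	printf("remove whole pages [%lu, %lu)\n", start, end);
	printf("zero bytes [%u, %u) of page %lu\n", partial_start, size, start - 1);
	printf("partial tail bytes: %u\n", partial_end);
	return 0;
}

So page 1 is dropped outright, bytes 1000..4095 of page 0 are zeroed via the partial_start path, and there is no partial_end work because lend + 1 is page aligned.
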
@@ -751,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
                WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
                goto redirty;
        }
+
+       /*
+        * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+        * value into swapfile.c, the only way we can correctly account for a
+        * fallocated page arriving here is now to initialize it and write it.
+        *
+        * That's okay for a page already fallocated earlier, but if we have
+        * not yet completed the fallocation, then (a) we want to keep track
+        * of this page in case we have to undo it, and (b) it may not be a
+        * good idea to continue anyway, once we're pushing into swap.  So
+        * reactivate the page, and let shmem_fallocate() quit when too many.
+        */
+       if (!PageUptodate(page)) {
+               if (inode->i_private) {
+                       struct shmem_falloc *shmem_falloc;
+                       spin_lock(&inode->i_lock);
+                       shmem_falloc = inode->i_private;
+                       if (shmem_falloc &&
+                           index >= shmem_falloc->start &&
+                           index < shmem_falloc->next)
+                               shmem_falloc->nr_unswapped++;
+                       else
+                               shmem_falloc = NULL;
+                       spin_unlock(&inode->i_lock);
+                       if (shmem_falloc)
+                               goto redirty;
+               }
+               clear_highpage(page);
+               flush_dcache_page(page);
+               SetPageUptodate(page);
+       }
+
        swap = get_swap_page();
        if (!swap.val)
                goto redirty;
@@ -974,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        swp_entry_t swap;
        int error;
        int once = 0;
+       int alloced = 0;
 
        if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
                return -EFBIG;
@@ -985,19 +1065,21 @@ repeat:
                page = NULL;
        }
 
-       if (sgp != SGP_WRITE &&
+       if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
            ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
                error = -EINVAL;
                goto failed;
        }
 
+       /* fallocated page? */
+       if (page && !PageUptodate(page)) {
+               if (sgp != SGP_READ)
+                       goto clear;
+               unlock_page(page);
+               page_cache_release(page);
+               page = NULL;
+       }
        if (page || (sgp == SGP_READ && !swap.val)) {
-               /*
-                * Once we can get the page lock, it must be uptodate:
-                * if there were an error in reading back from swap,
-                * the page would not be inserted into the filecache.
-                */
-               BUG_ON(page && !PageUptodate(page));
                *pagep = page;
                return 0;
        }
@@ -1094,19 +1176,36 @@ repeat:
                inode->i_blocks += BLOCKS_PER_PAGE;
                shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+               alloced = true;
 
-               clear_highpage(page);
-               flush_dcache_page(page);
-               SetPageUptodate(page);
+               /*
+                * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+                */
+               if (sgp == SGP_FALLOC)
+                       sgp = SGP_WRITE;
+clear:
+               /*
+                * Let SGP_WRITE caller clear ends if write does not fill page;
+                * but SGP_FALLOC on a page fallocated earlier must initialize
+                * it now, lest undo on failure cancel our earlier guarantee.
+                */
+               if (sgp != SGP_WRITE) {
+                       clear_highpage(page);
+                       flush_dcache_page(page);
+                       SetPageUptodate(page);
+               }
                if (sgp == SGP_DIRTY)
                        set_page_dirty(page);
        }
 
        /* Perhaps the file has been truncated since we checked */
-       if (sgp != SGP_WRITE &&
+       if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
            ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
                error = -EINVAL;
-               goto trunc;
+               if (alloced)
+                       goto trunc;
+               else
+                       goto failed;
        }
        *pagep = page;
        return 0;
@@ -1115,6 +1214,7 @@ repeat:
         * Error recovery.
         */
 trunc:
+       info = SHMEM_I(inode);
        ClearPageDirty(page);
        delete_from_page_cache(page);
        spin_lock(&info->lock);
@@ -1122,6 +1222,7 @@ trunc:
        inode->i_blocks -= BLOCKS_PER_PAGE;
        spin_unlock(&info->lock);
 decused:
+       sbinfo = SHMEM_SB(inode->i_sb);
        if (sbinfo->max_blocks)
                percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
@@ -1307,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
        if (pos + copied > inode->i_size)
                i_size_write(inode, pos + copied);
 
+       if (!PageUptodate(page)) {
+               if (copied < PAGE_CACHE_SIZE) {
+                       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+                       zero_user_segments(page, 0, from,
+                                       from + copied, PAGE_CACHE_SIZE);
+               }
+               SetPageUptodate(page);
+       }
        set_page_dirty(page);
        unlock_page(page);
        page_cache_release(page);
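
As an illustration of the hunk above: with 4096-byte pages, a write of 50 bytes at file offset 100 into a page left !Uptodate by SGP_WRITE gives from == 100, so bytes [0, 100) and [150, 4096) are zeroed here before SetPageUptodate, ensuring the parts of the never-cleared page that the write did not fill cannot be read back later.
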
@@ -1565,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
        return error;
 }
 
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+                                   pgoff_t index, pgoff_t end, int origin)
+{
+       struct page *page;
+       struct pagevec pvec;
+       pgoff_t indices[PAGEVEC_SIZE];
+       bool done = false;
+       int i;
+
+       pagevec_init(&pvec, 0);
+       pvec.nr = 1;            /* start small: we may be there already */
+       while (!done) {
+               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+                                       pvec.nr, pvec.pages, indices);
+               if (!pvec.nr) {
+                       if (origin == SEEK_DATA)
+                               index = end;
+                       break;
+               }
+               for (i = 0; i < pvec.nr; i++, index++) {
+                       if (index < indices[i]) {
+                               if (origin == SEEK_HOLE) {
+                                       done = true;
+                                       break;
+                               }
+                               index = indices[i];
+                       }
+                       page = pvec.pages[i];
+                       if (page && !radix_tree_exceptional_entry(page)) {
+                               if (!PageUptodate(page))
+                                       page = NULL;
+                       }
+                       if (index >= end ||
+                           (page && origin == SEEK_DATA) ||
+                           (!page && origin == SEEK_HOLE)) {
+                               done = true;
+                               break;
+                       }
+               }
+               shmem_deswap_pagevec(&pvec);
+               pagevec_release(&pvec);
+               pvec.nr = PAGEVEC_SIZE;
+               cond_resched();
+       }
+       return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
+{
+       struct address_space *mapping;
+       struct inode *inode;
+       pgoff_t start, end;
+       loff_t new_offset;
+
+       if (origin != SEEK_DATA && origin != SEEK_HOLE)
+               return generic_file_llseek_size(file, offset, origin,
+                                                       MAX_LFS_FILESIZE);
+       mapping = file->f_mapping;
+       inode = mapping->host;
+       mutex_lock(&inode->i_mutex);
+       /* We're holding i_mutex so we can access i_size directly */
+
+       if (offset < 0)
+               offset = -EINVAL;
+       else if (offset >= inode->i_size)
+               offset = -ENXIO;
+       else {
+               start = offset >> PAGE_CACHE_SHIFT;
+               end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+               new_offset = shmem_seek_hole_data(mapping, start, end, origin);
+               new_offset <<= PAGE_CACHE_SHIFT;
+               if (new_offset > offset) {
+                       if (new_offset < inode->i_size)
+                               offset = new_offset;
+                       else if (origin == SEEK_DATA)
+                               offset = -ENXIO;
+                       else
+                               offset = inode->i_size;
+               }
+       }
+
+       if (offset >= 0 && offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+       mutex_unlock(&inode->i_mutex);
+       return offset;
+}
+
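
With shmem_file_llseek wired up below, SEEK_DATA and SEEK_HOLE become usable on tmpfs files. A user-space illustration — the path /tmp/seek-demo and the sizes are made up, and it assumes /tmp is a tmpfs; the lseek(2) usage itself is standard:

#define _GNU_SOURCE		/* for SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	off_t data, hole;

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 1 << 20) != 0 ||		/* 1 MiB hole */
	    pwrite(fd, "x", 1, 512 * 1024) != 1)	/* one byte of data in the middle */
		return 1;

	data = lseek(fd, 0, SEEK_DATA);		/* offset of the first data page */
	hole = lseek(fd, data, SEEK_HOLE);	/* next hole after that data */
	printf("data at %lld, hole at %lld\n", (long long)data, (long long)hole);

	close(fd);
	return 0;
}
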
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+                                                        loff_t len)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+       struct shmem_falloc shmem_falloc;
+       pgoff_t start, index, end;
+       int error;
+
+       mutex_lock(&inode->i_mutex);
+
+       if (mode & FALLOC_FL_PUNCH_HOLE) {
+               struct address_space *mapping = file->f_mapping;
+               loff_t unmap_start = round_up(offset, PAGE_SIZE);
+               loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+
+               if ((u64)unmap_end > (u64)unmap_start)
+                       unmap_mapping_range(mapping, unmap_start,
+                                           1 + unmap_end - unmap_start, 0);
+               shmem_truncate_range(inode, offset, offset + len - 1);
+               /* No need to unmap again: hole-punching leaves COWed pages */
+               error = 0;
+               goto out;
+       }
+
+       /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+       error = inode_newsize_ok(inode, offset + len);
+       if (error)
+               goto out;
+
+       start = offset >> PAGE_CACHE_SHIFT;
+       end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       /* Try to avoid a swapstorm if len is impossible to satisfy */
+       if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+               error = -ENOSPC;
+               goto out;
+       }
+
+       shmem_falloc.start = start;
+       shmem_falloc.next  = start;
+       shmem_falloc.nr_falloced = 0;
+       shmem_falloc.nr_unswapped = 0;
+       spin_lock(&inode->i_lock);
+       inode->i_private = &shmem_falloc;
+       spin_unlock(&inode->i_lock);
+
+       for (index = start; index < end; index++) {
+               struct page *page;
+
+               /*
+                * Good, the fallocate(2) manpage permits EINTR: we may have
+                * been interrupted because we are using up too much memory.
+                */
+               if (signal_pending(current))
+                       error = -EINTR;
+               else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+                       error = -ENOMEM;
+               else
+                       error = shmem_getpage(inode, index, &page, SGP_FALLOC,
+                                                                       NULL);
+               if (error) {
+                       /* Remove the !PageUptodate pages we added */
+                       shmem_undo_range(inode,
+                               (loff_t)start << PAGE_CACHE_SHIFT,
+                               (loff_t)index << PAGE_CACHE_SHIFT, true);
+                       goto undone;
+               }
+
+               /*
+                * Inform shmem_writepage() how far we have reached.
+                * No need for lock or barrier: we have the page lock.
+                */
+               shmem_falloc.next++;
+               if (!PageUptodate(page))
+                       shmem_falloc.nr_falloced++;
+
+               /*
+                * If !PageUptodate, leave it that way so that freeable pages
+                * can be recognized if we need to rollback on error later.
+                * But set_page_dirty so that memory pressure will swap rather
+                * than free the pages we are allocating (and SGP_CACHE pages
+                * might still be clean: we now need to mark those dirty too).
+                */
+               set_page_dirty(page);
+               unlock_page(page);
+               page_cache_release(page);
+               cond_resched();
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+               i_size_write(inode, offset + len);
+       inode->i_ctime = CURRENT_TIME;
+undone:
+       spin_lock(&inode->i_lock);
+       inode->i_private = NULL;
+       spin_unlock(&inode->i_lock);
+out:
+       mutex_unlock(&inode->i_mutex);
+       return error;
+}
+
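
And shmem_fallocate above gives tmpfs both preallocation and hole punching from user space. Another made-up illustration (again assuming /tmp is a tmpfs; the file name and sizes are arbitrary):

#define _GNU_SOURCE		/* for fallocate(2) and FALLOC_FL_* */
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;

	/* Preallocate 1 MiB: pages are reserved up front, so later writes
	 * cannot fail with ENOSPC; may fail with EINTR or ENOMEM as above. */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		perror("fallocate");

	/* Punch out the middle 256 KiB again, freeing those pages. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      256 * 1024, 256 * 1024) != 0)
		perror("punch hole");

	close(fd);
	return 0;
}
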
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1768,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                kaddr = kmap_atomic(page);
                memcpy(kaddr, symname, len);
                kunmap_atomic(kaddr);
+               SetPageUptodate(page);
                set_page_dirty(page);
                unlock_page(page);
                page_cache_release(page);
@@ -2373,6 +2676,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
                }
        }
        sb->s_export_op = &shmem_export_ops;
+       sb->s_flags |= MS_NOSEC;
 #else
        sb->s_flags |= MS_NOUSER;
 #endif
@@ -2467,7 +2771,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
        .mmap           = shmem_mmap,
 #ifdef CONFIG_TMPFS
-       .llseek         = generic_file_llseek,
+       .llseek         = shmem_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = shmem_file_aio_read,
@@ -2475,12 +2779,12 @@ static const struct file_operations shmem_file_operations = {
        .fsync          = noop_fsync,
        .splice_read    = shmem_file_splice_read,
        .splice_write   = generic_file_splice_write,
+       .fallocate      = shmem_fallocate,
 #endif
 };
 
 static const struct inode_operations shmem_inode_operations = {
        .setattr        = shmem_setattr,
-       .truncate_range = shmem_truncate_range,
 #ifdef CONFIG_TMPFS_XATTR
        .setxattr       = shmem_setxattr,
        .getxattr       = shmem_getxattr,