Merge branch 'master'

[pandora-kernel.git] / mm / filemap.c
diff --git a/mm/filemap.c b/mm/filemap.c

index 478f4c7..82c4488 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
  #include <linux/compiler.h>
  #include <linux/fs.h>
  #include <linux/aio.h>
+#include <linux/capability.h>
  #include <linux/kernel_stat.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
@@ -28,7 +29,10 @@
  #include <linux/blkdev.h>
  #include <linux/security.h>
  #include <linux/syscalls.h>
+#include <linux/cpuset.h>
  #include "filemap.h"
+#include "internal.h"
+
  /*
   * FIXME: remove all knowledge of the buffer layer from the core VM
   */
@@ -61,7 +65,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
   *      ->swap_lock            (exclusive_swap_page, others)
   *        ->mapping->tree_lock
   *
- *  ->i_sem
+ *  ->i_mutex
   *    ->i_mmap_lock            (truncate->unmap_mapping_range)
   *
   *  ->mmap_sem
@@ -73,9 +77,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
   *    ->lock_page              (access_process_vm)
   *
   *  ->mmap_sem
- *    ->i_sem                  (msync)
+ *    ->i_mutex                        (msync)
   *
- *  ->i_sem
+ *  ->i_mutex
   *    ->i_alloc_sem             (various)
   *
   *  ->inode_lock
@@ -93,6 +97,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
   *    ->private_lock           (try_to_unmap_one)
   *    ->tree_lock              (try_to_unmap_one)
   *    ->zone.lru_lock          (follow_page->mark_page_accessed)
+ *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
   *    ->private_lock           (page_remove_rmap->set_page_dirty)
   *    ->tree_lock              (page_remove_rmap->set_page_dirty)
   *    ->inode_lock             (page_remove_rmap->set_page_dirty)
@@ -170,7 +175,7 @@ static int sync_page(void *word)
   * dirty pages that lie within the byte offsets <start, end>
   * @mapping:   address space structure to write
   * @start:     offset in bytes where the range starts
- * @end:       offset in bytes where the range ends
+ * @end:       offset in bytes where the range ends (inclusive)
   * @sync_mode: enable synchronous operation
   *
   * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -178,8 +183,8 @@ static int sync_page(void *word)
   * these two operations is that if a dirty page/buffer is encountered, it must
   * be waited upon, and not just skipped over.
   */
-static int __filemap_fdatawrite_range(struct address_space *mapping,
-       loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+                               loff_t end, int sync_mode)
  {
         int ret;
         struct writeback_control wbc = {
@@ -208,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
  }
  EXPORT_SYMBOL(filemap_fdatawrite);
  
-static int filemap_fdatawrite_range(struct address_space *mapping,
-       loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+                               loff_t end)     /* byte offsets; end is inclusive */
  {
         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
  }
@@ -228,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
   * Wait for writeback to complete against pages indexed by start->end
   * inclusive
   */
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
                                 pgoff_t start, pgoff_t end)
  {
         struct pagevec pvec;
@@ -276,7 +281,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
   * integrity" operation.  It waits upon in-flight writeout before starting and
   * waiting upon new writeout.  If there was an IO error, return it.
   *
- * We need to re-take i_sem during the generic_osync_inode list walk because
+ * We need to re-take i_mutex during the generic_osync_inode list walk because
   * it is otherwise livelockable.
   */
  int sync_page_range(struct inode *inode, struct address_space *mapping,
@@ -290,9 +295,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
                 return 0;
         ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
         if (ret == 0) {
-               down(&inode->i_sem);
+               mutex_lock(&inode->i_mutex);
                 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-               up(&inode->i_sem);
+               mutex_unlock(&inode->i_mutex);
         }
         if (ret == 0)
                 ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,7 +306,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
  EXPORT_SYMBOL(sync_page_range);
  
  /*
- * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
   * as it forces O_SYNC writers to different parts of the same file
   * to be serialised right until io completion.
   */
@@ -363,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
  }
  EXPORT_SYMBOL(filemap_write_and_wait);
  
+/*
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (lend = -1).
+ */
  int filemap_write_and_wait_range(struct address_space *mapping,
                                  loff_t lstart, loff_t lend)
  {
@@ -423,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
         return ret;
  }
  
+#ifdef CONFIG_NUMA
+struct page *page_cache_alloc(struct address_space *x)
+{      /* Allocate one page (order 0) using x's mapping_gfp_mask(). */
+       if (cpuset_do_page_mem_spread()) {      /* true: use the node the cpuset picks */
+               int n = cpuset_mem_spread_node();
+               return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+       }
+       return alloc_pages(mapping_gfp_mask(x), 0);     /* no explicit node */
+}
+EXPORT_SYMBOL(page_cache_alloc);
+
+struct page *page_cache_alloc_cold(struct address_space *x)
+{      /* Same as page_cache_alloc(), with __GFP_COLD or'ed into the mask. */
+       if (cpuset_do_page_mem_spread()) {      /* true: use the node the cpuset picks */
+               int n = cpuset_mem_spread_node();
+               return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+       }
+       return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);  /* no explicit node */
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
+
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
@@ -664,6 +697,38 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
         return ret;
  }
  
+/**
+ * find_get_pages_contig - gang contiguous pagecache lookup
+ * @mapping:   The address_space to search
+ * @index:     The starting page index
+ * @nr_pages:  The maximum number of pages
+ * @pages:     Where the resulting pages are placed
+ *
+ * find_get_pages_contig() works like find_get_pages(), except that the
+ * pages returned are guaranteed to be contiguous, starting at @index: the
+ * walk stops at the first page with no ->mapping or an unexpected ->index.
+ * page_cache_get() is called on each returned page; returns their count.
+ */
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+                              unsigned int nr_pages, struct page **pages)
+{
+       unsigned int i;
+       unsigned int ret;
+
+       read_lock_irq(&mapping->tree_lock);
+       ret = radix_tree_gang_lookup(&mapping->page_tree,
+                               (void **)pages, index, nr_pages);
+       for (i = 0; i < ret; i++) {
+               if (pages[i]->mapping == NULL || pages[i]->index != index)
+                       break;          /* first gap ends the contiguous run */
+
+               page_cache_get(pages[i]);
+               index++;
+       }
+       read_unlock_irq(&mapping->tree_lock);
+       return i;       /* may be less than ret if the run ended early */
+}
+
  /*
   * Like find_get_pages, except we only return pages which are tagged with
   * `tag'.   We update *index to index the next page for the traversal.
@@ -979,6 +1044,7 @@ success:
         desc->arg.buf += size;
         return size;
  }
+EXPORT_SYMBOL(file_read_actor);
  
  /*
   * This is the "read()" routine for all filesystems
@@ -1892,7 +1958,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
         /*
          * Sync the fs metadata but not the minor inode changes and
          * of course not the data as we did direct DMA for the IO.
-        * i_sem is held, which protects generic_osync_inode() from
+        * i_mutex is held, which protects generic_osync_inode() from
          * livelocking.
          */
         if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2108,7 +2174,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
         if (err)
                 goto out;
  
-       inode_update_time(inode, 1);
+       file_update_time(file);
  
         /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
         if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2195,10 +2261,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
  
         BUG_ON(iocb->ki_pos != pos);
  
-       down(&inode->i_sem);
+       mutex_lock(&inode->i_mutex);
         ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
                                                 &iocb->ki_pos);
-       up(&inode->i_sem);
+       mutex_unlock(&inode->i_mutex);
  
         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
                 ssize_t err;
@@ -2220,9 +2286,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
         struct iovec local_iov = { .iov_base = (void __user *)buf,
                                         .iov_len = count };
  
-       down(&inode->i_sem);
+       mutex_lock(&inode->i_mutex);
         ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
-       up(&inode->i_sem);
+       mutex_unlock(&inode->i_mutex);
  
         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
                 ssize_t err;
@@ -2256,9 +2322,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
         struct inode *inode = mapping->host;
         ssize_t ret;
  
-       down(&inode->i_sem);
+       mutex_lock(&inode->i_mutex);
         ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
-       up(&inode->i_sem);
+       mutex_unlock(&inode->i_mutex);
  
         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
                 int err;
@@ -2272,7 +2338,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
  EXPORT_SYMBOL(generic_file_writev);
  
  /*
- * Called under i_sem for writes to S_ISREG files.   Returns -EIO if something
+ * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
   * went wrong during pagecache shootdown.
   */
  static ssize_t