Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Jul 2011 17:39:54 +0000 (10:39 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Jul 2011 17:39:54 +0000 (10:39 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c

fs/block_dev.c
fs/ext4/inode.c
fs/fs-writeback.c
fs/inode.c
fs/nfs/write.c
include/trace/events/ext4.h
mm/backing-dev.c
mm/filemap.c
mm/page-writeback.c
mm/rmap.c

diff --combined fs/block_dev.c
@@@ -44,24 -44,28 +44,28 @@@ inline struct block_device *I_BDEV(stru
  {
        return &BDEV_I(inode)->bdev;
  }
  EXPORT_SYMBOL(I_BDEV);
  
  /*
-  * move the inode from it's current bdi to the a new bdi. if the inode is dirty
-  * we need to move it onto the dirty list of @dst so that the inode is always
-  * on the right list.
+  * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+  * need to move it onto the dirty list of @dst so that the inode is always on
+  * the right list.
   */
  static void bdev_inode_switch_bdi(struct inode *inode,
                        struct backing_dev_info *dst)
  {
-       spin_lock(&inode_wb_list_lock);
+       struct backing_dev_info *old = inode->i_data.backing_dev_info;
+       if (unlikely(dst == old))               /* deadlock avoidance */
+               return;
+       bdi_lock_two(&old->wb, &dst->wb);
        spin_lock(&inode->i_lock);
        inode->i_data.backing_dev_info = dst;
        if (inode->i_state & I_DIRTY)
                list_move(&inode->i_wb_list, &dst->wb.b_dirty);
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&old->wb.list_lock);
+       spin_unlock(&dst->wb.list_lock);
  }
  
  static sector_t max_block(struct block_device *bdev)
@@@ -355,30 -359,25 +359,30 @@@ static loff_t block_llseek(struct file 
        mutex_lock(&bd_inode->i_mutex);
        size = i_size_read(bd_inode);
  
 +      retval = -EINVAL;
        switch (origin) {
 -              case 2:
 +              case SEEK_END:
                        offset += size;
                        break;
 -              case 1:
 +              case SEEK_CUR:
                        offset += file->f_pos;
 +              case SEEK_SET:
 +                      break;
 +              default:
 +                      goto out;
        }
 -      retval = -EINVAL;
        if (offset >= 0 && offset <= size) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                }
                retval = offset;
        }
 +out:
        mutex_unlock(&bd_inode->i_mutex);
        return retval;
  }
        
 -int blkdev_fsync(struct file *filp, int datasync)
 +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
  {
        struct inode *bd_inode = filp->f_mapping->host;
        struct block_device *bdev = I_BDEV(bd_inode);
         * i_mutex and doing so causes performance issues with concurrent
         * O_SYNC writers to a block device.
         */
 -      mutex_unlock(&bd_inode->i_mutex);
 -
        error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
        if (error == -EOPNOTSUPP)
                error = 0;
  
 -      mutex_lock(&bd_inode->i_mutex);
 -
        return error;
  }
  EXPORT_SYMBOL(blkdev_fsync);
@@@ -763,19 -766,7 +767,19 @@@ static struct block_device *bd_start_cl
        if (!disk)
                return ERR_PTR(-ENXIO);
  
 -      whole = bdget_disk(disk, 0);
 +      /*
 +       * Normally, @bdev should equal what's returned from bdget_disk()
 +       * if partno is 0; however, some drivers (floppy) use multiple
 +       * bdev's for the same physical device and @bdev may be one of the
 +       * aliases.  Keep @bdev if partno is 0.  This means claimer
 +       * tracking is broken for those devices but it has always been that
 +       * way.
 +       */
 +      if (partno)
 +              whole = bdget_disk(disk, 0);
 +      else
 +              whole = bdgrab(bdev);
 +
        module_put(disk->fops->owner);
        put_disk(disk);
        if (!whole)
@@@ -1448,8 -1439,6 +1452,8 @@@ static int __blkdev_put(struct block_de
  
  int blkdev_put(struct block_device *bdev, fmode_t mode)
  {
 +      mutex_lock(&bdev->bd_mutex);
 +
        if (mode & FMODE_EXCL) {
                bool bdev_free;
  
                 * are protected with bdev_lock.  bd_mutex is to
                 * synchronize disk_holder unlinking.
                 */
 -              mutex_lock(&bdev->bd_mutex);
                spin_lock(&bdev_lock);
  
                WARN_ON_ONCE(--bdev->bd_holders < 0);
                 * If this was the last claim, remove holder link and
                 * unblock evpoll if it was a write holder.
                 */
 -              if (bdev_free) {
 -                      if (bdev->bd_write_holder) {
 -                              disk_unblock_events(bdev->bd_disk);
 -                              disk_check_events(bdev->bd_disk);
 -                              bdev->bd_write_holder = false;
 -                      }
 +              if (bdev_free && bdev->bd_write_holder) {
 +                      disk_unblock_events(bdev->bd_disk);
 +                      bdev->bd_write_holder = false;
                }
 -
 -              mutex_unlock(&bdev->bd_mutex);
        }
  
 +      /*
 +       * Trigger event checking and tell drivers to flush MEDIA_CHANGE
 +       * event.  This is to ensure detection of media removal commanded
 +       * from userland - e.g. eject(1).
 +       */
 +      disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
 +
 +      mutex_unlock(&bdev->bd_mutex);
 +
        return __blkdev_put(bdev, mode, 0);
  }
  EXPORT_SYMBOL(blkdev_put);
diff --combined fs/ext4/inode.c
@@@ -2634,7 -2634,7 +2634,7 @@@ static int ext4_writepage(struct page *
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
  
 -      trace_ext4_writepage(inode, page);
 +      trace_ext4_writepage(page);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@@ -2741,7 -2741,7 +2741,7 @@@ static int write_cache_pages_da(struct 
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
  
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
@@@ -2973,7 -2973,7 +2973,7 @@@ static int ext4_da_writepages(struct ad
        }
  
  retry:
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
  
        while (!ret && wbc->nr_to_write > 0) {
@@@ -3501,8 -3501,10 +3501,8 @@@ retry
                                 offset, nr_segs,
                                 ext4_get_block, NULL, NULL, 0);
        else {
 -              ret = blockdev_direct_IO(rw, iocb, inode,
 -                               inode->i_sb->s_bdev, iov,
 -                               offset, nr_segs,
 -                               ext4_get_block, NULL);
 +              ret = blockdev_direct_IO(rw, iocb, inode, iov,
 +                               offset, nr_segs, ext4_get_block);
  
                if (unlikely((rw & WRITE) && ret < 0)) {
                        loff_t isize = i_size_read(inode);
@@@ -3573,7 -3575,6 +3573,7 @@@ static void ext4_end_io_dio(struct kioc
                            ssize_t size, void *private, int ret,
                            bool is_async)
  {
 +      struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
          ext4_io_end_t *io_end = iocb->private;
        struct workqueue_struct *wq;
        unsigned long flags;
  out:
                if (is_async)
                        aio_complete(iocb, ret, 0);
 +              inode_dio_done(inode);
                return;
        }
  
        /* queue the work to convert unwritten extents to written */
        queue_work(wq, &io_end->work);
        iocb->private = NULL;
 +
 +      /* XXX: probably should move into the real I/O completion handler */
 +      inode_dio_done(inode);
  }
  
  static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@@ -3751,13 -3748,11 +3751,13 @@@ static ssize_t ext4_ext_direct_IO(int r
                        EXT4_I(inode)->cur_aio_dio = iocb->private;
                }
  
 -              ret = blockdev_direct_IO(rw, iocb, inode,
 +              ret = __blockdev_direct_IO(rw, iocb, inode,
                                         inode->i_sb->s_bdev, iov,
                                         offset, nr_segs,
                                         ext4_get_block_write,
 -                                       ext4_end_io_dio);
 +                                       ext4_end_io_dio,
 +                                       NULL,
 +                                       DIO_LOCKING | DIO_SKIP_HOLES);
                if (iocb->private)
                        EXT4_I(inode)->cur_aio_dio = NULL;
                /*
@@@ -5356,8 -5351,6 +5356,8 @@@ int ext4_setattr(struct dentry *dentry
        }
  
        if (attr->ia_valid & ATTR_SIZE) {
 +              inode_dio_wait(inode);
 +
                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  
@@@ -5850,84 -5843,80 +5850,84 @@@ int ext4_page_mkwrite(struct vm_area_st
        struct page *page = vmf->page;
        loff_t size;
        unsigned long len;
 -      int ret = -EINVAL;
 -      void *fsdata;
 +      int ret;
        struct file *file = vma->vm_file;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
 +      handle_t *handle;
 +      get_block_t *get_block;
 +      int retries = 0;
  
        /*
 -       * Get i_alloc_sem to stop truncates messing with the inode. We cannot
 -       * get i_mutex because we are already holding mmap_sem.
 +       * This check is racy but catches the common case. We rely on
 +       * __block_page_mkwrite() to do a reliable check.
         */
 -      down_read(&inode->i_alloc_sem);
 -      size = i_size_read(inode);
 -      if (page->mapping != mapping || size <= page_offset(page)
 -          || !PageUptodate(page)) {
 -              /* page got truncated from under us? */
 -              goto out_unlock;
 +      vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 +      /* Delalloc case is easy... */
 +      if (test_opt(inode->i_sb, DELALLOC) &&
 +          !ext4_should_journal_data(inode) &&
 +          !ext4_nonda_switch(inode->i_sb)) {
 +              do {
 +                      ret = __block_page_mkwrite(vma, vmf,
 +                                                 ext4_da_get_block_prep);
 +              } while (ret == -ENOSPC &&
 +                     ext4_should_retry_alloc(inode->i_sb, &retries));
 +              goto out_ret;
        }
 -      ret = 0;
  
        lock_page(page);
 -      wait_on_page_writeback(page);
 -      if (PageMappedToDisk(page)) {
 -              up_read(&inode->i_alloc_sem);
 -              return VM_FAULT_LOCKED;
 +      size = i_size_read(inode);
 +      /* Page got truncated from under us? */
 +      if (page->mapping != mapping || page_offset(page) > size) {
 +              unlock_page(page);
 +              ret = VM_FAULT_NOPAGE;
 +              goto out;
        }
  
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
        else
                len = PAGE_CACHE_SIZE;
 -
        /*
 -       * return if we have all the buffers mapped. This avoid
 -       * the need to call write_begin/write_end which does a
 -       * journal_start/journal_stop which can block and take
 -       * long time
 +       * Return if we have all the buffers mapped. This avoids the need to do
 +       * journal_start/journal_stop which can block and take a long time
         */
        if (page_has_buffers(page)) {
                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
                                        ext4_bh_unmapped)) {
 -                      up_read(&inode->i_alloc_sem);
 -                      return VM_FAULT_LOCKED;
 +                      /* Wait so that we don't change page under IO */
 +                      wait_on_page_writeback(page);
 +                      ret = VM_FAULT_LOCKED;
 +                      goto out;
                }
        }
        unlock_page(page);
 -      /*
 -       * OK, we need to fill the hole... Do write_begin write_end
 -       * to do block allocation/reservation.We are not holding
 -       * inode.i__mutex here. That allow * parallel write_begin,
 -       * write_end call. lock_page prevent this from happening
 -       * on the same page though
 -       */
 -      ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
 -                      len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 -      if (ret < 0)
 -              goto out_unlock;
 -      ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
 -                      len, len, page, fsdata);
 -      if (ret < 0)
 -              goto out_unlock;
 -      ret = 0;
 -
 -      /*
 -       * write_begin/end might have created a dirty page and someone
 -       * could wander in and start the IO.  Make sure that hasn't
 -       * happened.
 -       */
 -      lock_page(page);
 -      wait_on_page_writeback(page);
 -      up_read(&inode->i_alloc_sem);
 -      return VM_FAULT_LOCKED;
 -out_unlock:
 -      if (ret)
 +      /* OK, we need to fill the hole... */
 +      if (ext4_should_dioread_nolock(inode))
 +              get_block = ext4_get_block_write;
 +      else
 +              get_block = ext4_get_block;
 +retry_alloc:
 +      handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +      if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
 -      up_read(&inode->i_alloc_sem);
 +              goto out;
 +      }
 +      ret = __block_page_mkwrite(vma, vmf, get_block);
 +      if (!ret && ext4_should_journal_data(inode)) {
 +              if (walk_page_buffers(handle, page_buffers(page), 0,
 +                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
 +                      unlock_page(page);
 +                      ret = VM_FAULT_SIGBUS;
 +                      goto out;
 +              }
 +              ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 +      }
 +      ext4_journal_stop(handle);
 +      if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 +              goto retry_alloc;
 +out_ret:
 +      ret = block_page_mkwrite_return(ret);
 +out:
        return ret;
  }
diff --combined fs/fs-writeback.c
@@@ -35,7 -35,9 +35,9 @@@
  struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
+       unsigned long *older_than_this;
        enum writeback_sync_modes sync_mode;
+       unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
@@@ -180,12 -182,13 +182,13 @@@ void bdi_start_background_writeback(str
   */
  void inode_wb_list_del(struct inode *inode)
  {
-       spin_lock(&inode_wb_list_lock);
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       spin_lock(&bdi->wb.list_lock);
        list_del_init(&inode->i_wb_list);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&bdi->wb.list_lock);
  }
  
  /*
   * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
   * furthest end of its superblock's dirty-inode list.
   * the case then the inode must have been redirtied while it was being written
   * out and we don't reset its dirtied_when.
   */
- static void redirty_tail(struct inode *inode)
+ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  {
-       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-       assert_spin_locked(&inode_wb_list_lock);
+       assert_spin_locked(&wb->list_lock);
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
  
  /*
   * requeue inode for re-scanning after bdi->b_io list is exhausted.
   */
- static void requeue_io(struct inode *inode)
+ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
  {
-       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-       assert_spin_locked(&inode_wb_list_lock);
+       assert_spin_locked(&wb->list_lock);
        list_move(&inode->i_wb_list, &wb->b_more_io);
  }
  
@@@ -225,7 -224,7 +224,7 @@@ static void inode_sync_complete(struct 
  {
        /*
         * Prevent speculative execution through
-        * spin_unlock(&inode_wb_list_lock);
+        * spin_unlock(&wb->list_lock);
         */
  
        smp_mb();
@@@ -250,15 -249,16 +249,16 @@@ static bool inode_dirtied_after(struct 
  /*
   * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
   */
- static void move_expired_inodes(struct list_head *delaying_queue,
+ static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
-                               unsigned long *older_than_this)
+                              unsigned long *older_than_this)
  {
        LIST_HEAD(tmp);
        struct list_head *pos, *node;
        struct super_block *sb = NULL;
        struct inode *inode;
        int do_sb_sort = 0;
+       int moved = 0;
  
        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
                        do_sb_sort = 1;
                sb = inode->i_sb;
                list_move(&inode->i_wb_list, &tmp);
+               moved++;
        }
  
        /* just one sb in list, splice to dispatch_queue and we're done */
        if (!do_sb_sort) {
                list_splice(&tmp, dispatch_queue);
-               return;
+               goto out;
        }
  
        /* Move inodes from one superblock together */
                                list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
+ out:
+       return moved;
  }
  
  /*
   */
  static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
  {
-       assert_spin_locked(&inode_wb_list_lock);
+       int moved;
+       assert_spin_locked(&wb->list_lock);
        list_splice_init(&wb->b_more_io, &wb->b_io);
-       move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+       moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+       trace_writeback_queue_io(wb, older_than_this, moved);
  }
  
  static int write_inode(struct inode *inode, struct writeback_control *wbc)
  /*
   * Wait for writeback on an inode to complete.
   */
- static void inode_wait_for_writeback(struct inode *inode)
+ static void inode_wait_for_writeback(struct inode *inode,
+                                    struct bdi_writeback *wb)
  {
        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
        wait_queue_head_t *wqh;
        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        while (inode->i_state & I_SYNC) {
                spin_unlock(&inode->i_lock);
-               spin_unlock(&inode_wb_list_lock);
+               spin_unlock(&wb->list_lock);
                __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-               spin_lock(&inode_wb_list_lock);
+               spin_lock(&wb->list_lock);
                spin_lock(&inode->i_lock);
        }
  }
  
  /*
-  * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
+  * Write out an inode's dirty pages.  Called under wb->list_lock and
   * inode->i_lock.  Either the caller has an active reference on the inode or
   * the inode has I_WILL_FREE set.
   *
   * livelocks, etc.
   */
  static int
- writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+                      struct writeback_control *wbc)
  {
        struct address_space *mapping = inode->i_mapping;
+       long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;
  
-       assert_spin_locked(&inode_wb_list_lock);
+       assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
  
        if (!atomic_read(&inode->i_count))
                 * completed a full scan of b_io.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL) {
-                       requeue_io(inode);
+                       requeue_io(inode, wb);
+                       trace_writeback_single_inode_requeue(inode, wbc,
+                                                            nr_to_write);
                        return 0;
                }
  
                /*
                 * It's a data-integrity sync.  We must wait.
                 */
-               inode_wait_for_writeback(inode);
+               inode_wait_for_writeback(inode, wb);
        }
  
        BUG_ON(inode->i_state & I_SYNC);
        inode->i_state |= I_SYNC;
        inode->i_state &= ~I_DIRTY_PAGES;
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
  
        ret = do_writepages(mapping, wbc);
  
                        ret = err;
        }
  
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
        inode->i_state &= ~I_SYNC;
        if (!(inode->i_state & I_FREEING)) {
+               /*
+                * Sync livelock prevention. Each inode is tagged and synced in
+                * one shot. If still dirty, it will be redirty_tail()'ed below.
+                * Update the dirty time to prevent enqueue and sync it again.
+                */
+               if ((inode->i_state & I_DIRTY) &&
+                   (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+                       inode->dirtied_when = jiffies;
                if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                        /*
                         * We didn't write back all the pages.  nfs_writepages()
                                /*
                                 * slice used up: queue for next turn
                                 */
-                               requeue_io(inode);
+                               requeue_io(inode, wb);
                        } else {
                                /*
                                 * Writeback blocked by something other than
                                 * retrying writeback of the dirty page/inode
                                 * that cannot be performed immediately.
                                 */
-                               redirty_tail(inode);
+                               redirty_tail(inode, wb);
                        }
                } else if (inode->i_state & I_DIRTY) {
                        /*
                         * submission or metadata updates after data IO
                         * completion.
                         */
-                       redirty_tail(inode);
+                       redirty_tail(inode, wb);
                } else {
                        /*
                         * The inode is clean.  At this point we either have
                }
        }
        inode_sync_complete(inode);
+       trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
  }
  
 -/*
 - * For background writeback the caller does not have the sb pinned
 - * before calling writeback. So make sure that we do pin it, so it doesn't
 - * go away while we are writing inodes from it.
 - */
 -static bool pin_sb_for_writeback(struct super_block *sb)
 -{
 -      spin_lock(&sb_lock);
 -      if (list_empty(&sb->s_instances)) {
 -              spin_unlock(&sb_lock);
 -              return false;
 -      }
 -
 -      sb->s_count++;
 -      spin_unlock(&sb_lock);
 -
 -      if (down_read_trylock(&sb->s_umount)) {
 -              if (sb->s_root)
 -                      return true;
 -              up_read(&sb->s_umount);
 -      }
 -
 -      put_super(sb);
 -      return false;
 -}
 -
+ static long writeback_chunk_size(struct backing_dev_info *bdi,
+                                struct wb_writeback_work *work)
+ {
+       long pages;
+       /*
+        * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+        * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+        * here avoids calling into writeback_inodes_wb() more than once.
+        *
+        * The intended call sequence for WB_SYNC_ALL writeback is:
+        *
+        *      wb_writeback()
+        *          writeback_sb_inodes()       <== called only once
+        *              write_cache_pages()     <== called once for each inode
+        *                   (quickly) tag currently dirty pages
+        *                   (maybe slowly) sync all tagged pages
+        */
+       if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+               pages = LONG_MAX;
+       else {
+               pages = min(bdi->avg_write_bandwidth / 2,
+                           global_dirty_limit / DIRTY_SCOPE);
+               pages = min(pages, work->nr_pages);
+               pages = round_down(pages + MIN_WRITEBACK_PAGES,
+                                  MIN_WRITEBACK_PAGES);
+       }
+       return pages;
+ }
  /*
   * Write a portion of b_io inodes which belong to @sb.
   *
   * inodes. Otherwise write only ones which go sequentially
   * in reverse order.
   *
-  * Return 1, if the caller writeback routine should be
-  * interrupted. Otherwise return 0.
+  * Return the number of pages and/or inodes written.
   */
- static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
-               struct writeback_control *wbc, bool only_this_sb)
+ static long writeback_sb_inodes(struct super_block *sb,
+                               struct bdi_writeback *wb,
+                               struct wb_writeback_work *work)
  {
+       struct writeback_control wbc = {
+               .sync_mode              = work->sync_mode,
+               .tagged_writepages      = work->tagged_writepages,
+               .for_kupdate            = work->for_kupdate,
+               .for_background         = work->for_background,
+               .range_cyclic           = work->range_cyclic,
+               .range_start            = 0,
+               .range_end              = LLONG_MAX,
+       };
+       unsigned long start_time = jiffies;
+       long write_chunk;
+       long wrote = 0;  /* count both pages and inodes */
        while (!list_empty(&wb->b_io)) {
-               long pages_skipped;
                struct inode *inode = wb_inode(wb->b_io.prev);
  
                if (inode->i_sb != sb) {
-                       if (only_this_sb) {
+                       if (work->sb) {
                                /*
                                 * We only want to write back data for this
                                 * superblock, move all inodes not belonging
                                 * to it back onto the dirty list.
                                 */
-                               redirty_tail(inode);
+                               redirty_tail(inode, wb);
                                continue;
                        }
  
                         * Bounce back to the caller to unpin this and
                         * pin the next superblock.
                         */
-                       return 0;
+                       break;
                }
  
                /*
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
-                       requeue_io(inode);
+                       redirty_tail(inode, wb);
                        continue;
                }
-               /*
-                * Was this inode dirtied after sync_sb_inodes was called?
-                * This keeps sync from extra jobs and livelock.
-                */
-               if (inode_dirtied_after(inode, wbc->wb_start)) {
-                       spin_unlock(&inode->i_lock);
-                       return 1;
-               }
                __iget(inode);
+               write_chunk = writeback_chunk_size(wb->bdi, work);
+               wbc.nr_to_write = write_chunk;
+               wbc.pages_skipped = 0;
  
-               pages_skipped = wbc->pages_skipped;
-               writeback_single_inode(inode, wbc);
-               if (wbc->pages_skipped != pages_skipped) {
+               writeback_single_inode(inode, wb, &wbc);
+               work->nr_pages -= write_chunk - wbc.nr_to_write;
+               wrote += write_chunk - wbc.nr_to_write;
+               if (!(inode->i_state & I_DIRTY))
+                       wrote++;
+               if (wbc.pages_skipped) {
                        /*
                         * writeback is not making progress due to locked
                         * buffers.  Skip this inode for now.
                         */
-                       redirty_tail(inode);
+                       redirty_tail(inode, wb);
                }
                spin_unlock(&inode->i_lock);
-               spin_unlock(&inode_wb_list_lock);
+               spin_unlock(&wb->list_lock);
                iput(inode);
                cond_resched();
-               spin_lock(&inode_wb_list_lock);
-               if (wbc->nr_to_write <= 0) {
-                       wbc->more_io = 1;
-                       return 1;
+               spin_lock(&wb->list_lock);
+               /*
+                * bail out to wb_writeback() often enough to check
+                * background threshold and other termination conditions.
+                */
+               if (wrote) {
+                       if (time_is_before_jiffies(start_time + HZ / 10UL))
+                               break;
+                       if (work->nr_pages <= 0)
+                               break;
                }
-               if (!list_empty(&wb->b_more_io))
-                       wbc->more_io = 1;
        }
-       /* b_io is empty */
-       return 1;
+       return wrote;
  }
  
- void writeback_inodes_wb(struct bdi_writeback *wb,
-               struct writeback_control *wbc)
+ static long __writeback_inodes_wb(struct bdi_writeback *wb,
+                                 struct wb_writeback_work *work)
  {
-       int ret = 0;
-       if (!wbc->wb_start)
-               wbc->wb_start = jiffies; /* livelock avoidance */
-       spin_lock(&inode_wb_list_lock);
-       if (!wbc->for_kupdate || list_empty(&wb->b_io))
-               queue_io(wb, wbc->older_than_this);
+       unsigned long start_time = jiffies;
+       long wrote = 0;
  
        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;
  
 -              if (!pin_sb_for_writeback(sb)) {
 +              if (!grab_super_passive(sb)) {
-                       requeue_io(inode);
+                       requeue_io(inode, wb);
                        continue;
                }
-               ret = writeback_sb_inodes(sb, wb, wbc, false);
+               wrote += writeback_sb_inodes(sb, wb, work);
                drop_super(sb);
  
-               if (ret)
-                       break;
+               /* refer to the same tests at the end of writeback_sb_inodes */
+               if (wrote) {
+                       if (time_is_before_jiffies(start_time + HZ / 10UL))
+                               break;
+                       if (work->nr_pages <= 0)
+                               break;
+               }
        }
-       spin_unlock(&inode_wb_list_lock);
        /* Leave any unwritten inodes on b_io */
+       return wrote;
  }
  
- static void __writeback_inodes_sb(struct super_block *sb,
-               struct bdi_writeback *wb, struct writeback_control *wbc)
+ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
  {
-       WARN_ON(!rwsem_is_locked(&sb->s_umount));
+       struct wb_writeback_work work = {
+               .nr_pages       = nr_pages,
+               .sync_mode      = WB_SYNC_NONE,
+               .range_cyclic   = 1,
+       };
  
-       spin_lock(&inode_wb_list_lock);
-       if (!wbc->for_kupdate || list_empty(&wb->b_io))
-               queue_io(wb, wbc->older_than_this);
-       writeback_sb_inodes(sb, wb, wbc, true);
-       spin_unlock(&inode_wb_list_lock);
- }
+       spin_lock(&wb->list_lock);
+       if (list_empty(&wb->b_io))
+               queue_io(wb, NULL);
+       __writeback_inodes_wb(wb, &work);
+       spin_unlock(&wb->list_lock);
  
- /*
-  * The maximum number of pages to writeout in a single bdi flush/kupdate
-  * operation.  We do this so we don't hold I_SYNC against an inode for
-  * enormous amounts of time, which would block a userspace task which has
-  * been forced to throttle against that inode.  Also, the code reevaluates
-  * the dirty each time it has written this many pages.
-  */
- #define MAX_WRITEBACK_PAGES     1024
+       return nr_pages - work.nr_pages;
+ }
  
  static inline bool over_bground_thresh(void)
  {
                global_page_state(NR_UNSTABLE_NFS) > background_thresh);
  }
  
+ /*
+  * Called under wb->list_lock. If there are multiple wb per bdi,
+  * only the flusher working on the first wb should do it.
+  */
+ static void wb_update_bandwidth(struct bdi_writeback *wb,
+                               unsigned long start_time)
+ {
+       __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
+ }
  /*
   * Explicit flushing or periodic writeback of "old" data.
   *
  static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
  {
-       struct writeback_control wbc = {
-               .sync_mode              = work->sync_mode,
-               .older_than_this        = NULL,
-               .for_kupdate            = work->for_kupdate,
-               .for_background         = work->for_background,
-               .range_cyclic           = work->range_cyclic,
-       };
+       unsigned long wb_start = jiffies;
+       long nr_pages = work->nr_pages;
        unsigned long oldest_jif;
-       long wrote = 0;
-       long write_chunk;
        struct inode *inode;
+       long progress;
  
-       if (wbc.for_kupdate) {
-               wbc.older_than_this = &oldest_jif;
-               oldest_jif = jiffies -
-                               msecs_to_jiffies(dirty_expire_interval * 10);
-       }
-       if (!wbc.range_cyclic) {
-               wbc.range_start = 0;
-               wbc.range_end = LLONG_MAX;
-       }
+       oldest_jif = jiffies;
+       work->older_than_this = &oldest_jif;
  
-       /*
-        * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
-        * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
-        * here avoids calling into writeback_inodes_wb() more than once.
-        *
-        * The intended call sequence for WB_SYNC_ALL writeback is:
-        *
-        *      wb_writeback()
-        *          __writeback_inodes_sb()     <== called only once
-        *              write_cache_pages()     <== called once for each inode
-        *                   (quickly) tag currently dirty pages
-        *                   (maybe slowly) sync all tagged pages
-        */
-       if (wbc.sync_mode == WB_SYNC_NONE)
-               write_chunk = MAX_WRITEBACK_PAGES;
-       else
-               write_chunk = LONG_MAX;
-       wbc.wb_start = jiffies; /* livelock avoidance */
+       spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                if (work->for_background && !over_bground_thresh())
                        break;
  
-               wbc.more_io = 0;
-               wbc.nr_to_write = write_chunk;
-               wbc.pages_skipped = 0;
+               if (work->for_kupdate) {
+                       oldest_jif = jiffies -
+                               msecs_to_jiffies(dirty_expire_interval * 10);
+                       work->older_than_this = &oldest_jif;
+               }
  
-               trace_wbc_writeback_start(&wbc, wb->bdi);
+               trace_writeback_start(wb->bdi, work);
+               if (list_empty(&wb->b_io))
+                       queue_io(wb, work->older_than_this);
                if (work->sb)
-                       __writeback_inodes_sb(work->sb, wb, &wbc);
+                       progress = writeback_sb_inodes(work->sb, wb, work);
                else
-                       writeback_inodes_wb(wb, &wbc);
-               trace_wbc_writeback_written(&wbc, wb->bdi);
+                       progress = __writeback_inodes_wb(wb, work);
+               trace_writeback_written(wb->bdi, work);
  
-               work->nr_pages -= write_chunk - wbc.nr_to_write;
-               wrote += write_chunk - wbc.nr_to_write;
+               wb_update_bandwidth(wb, wb_start);
  
                /*
-                * If we consumed everything, see if we have more
+                * Did we write something? Try for more
+                *
+                * Dirty inodes are moved to b_io for writeback in batches.
+                * The completion of the current batch does not necessarily
+                * mean the overall work is done. So we keep looping as long
+                * as made some progress on cleaning pages or inodes.
                 */
-               if (wbc.nr_to_write <= 0)
+               if (progress)
                        continue;
                /*
-                * Didn't write everything and we don't have more IO, bail
+                * No more inodes for IO, bail
                 */
-               if (!wbc.more_io)
+               if (list_empty(&wb->b_more_io))
                        break;
-               /*
-                * Did we write something? Try for more
-                */
-               if (wbc.nr_to_write < write_chunk)
-                       continue;
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
-               spin_lock(&inode_wb_list_lock);
                if (!list_empty(&wb->b_more_io))  {
+                       trace_writeback_wait(wb->bdi, work);
                        inode = wb_inode(wb->b_more_io.prev);
-                       trace_wbc_writeback_wait(&wbc, wb->bdi);
                        spin_lock(&inode->i_lock);
-                       inode_wait_for_writeback(inode);
+                       inode_wait_for_writeback(inode, wb);
                        spin_unlock(&inode->i_lock);
                }
-               spin_unlock(&inode_wb_list_lock);
        }
+       spin_unlock(&wb->list_lock);
  
-       return wrote;
+       return nr_pages - work->nr_pages;
  }
  
  /*
@@@ -1063,10 -1129,10 +1103,10 @@@ void __mark_inode_dirty(struct inode *i
                        }
  
                        spin_unlock(&inode->i_lock);
-                       spin_lock(&inode_wb_list_lock);
+                       spin_lock(&bdi->wb.list_lock);
                        inode->dirtied_when = jiffies;
                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
-                       spin_unlock(&inode_wb_list_lock);
+                       spin_unlock(&bdi->wb.list_lock);
  
                        if (wakeup_bdi)
                                bdi_wakeup_thread_delayed(bdi);
@@@ -1162,10 -1228,11 +1202,11 @@@ void writeback_inodes_sb_nr(struct supe
  {
        DECLARE_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
-               .sb             = sb,
-               .sync_mode      = WB_SYNC_NONE,
-               .done           = &done,
-               .nr_pages       = nr,
+               .sb                     = sb,
+               .sync_mode              = WB_SYNC_NONE,
+               .tagged_writepages      = 1,
+               .done                   = &done,
+               .nr_pages               = nr,
        };
  
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@@ -1267,6 -1334,7 +1308,7 @@@ EXPORT_SYMBOL(sync_inodes_sb)
   */
  int write_inode_now(struct inode *inode, int sync)
  {
+       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
        int ret;
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                wbc.nr_to_write = 0;
  
        might_sleep();
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
-       ret = writeback_single_inode(inode, &wbc);
+       ret = writeback_single_inode(inode, wb, &wbc);
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
        if (sync)
                inode_sync_wait(inode);
        return ret;
@@@ -1303,13 -1371,14 +1345,14 @@@ EXPORT_SYMBOL(write_inode_now)
   */
  int sync_inode(struct inode *inode, struct writeback_control *wbc)
  {
+       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
        int ret;
  
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
-       ret = writeback_single_inode(inode, wbc);
+       ret = writeback_single_inode(inode, wb, wbc);
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
        return ret;
  }
  EXPORT_SYMBOL(sync_inode);
diff --combined fs/inode.c
   *
   * inode->i_lock protects:
   *   inode->i_state, inode->i_hash, __iget()
 - * inode_lru_lock protects:
 - *   inode_lru, inode->i_lru
 + * inode->i_sb->s_inode_lru_lock protects:
 + *   inode->i_sb->s_inode_lru, inode->i_lru
   * inode_sb_list_lock protects:
   *   sb->s_inodes, inode->i_sb_list
-  * inode_wb_list_lock protects:
+  * bdi->wb.list_lock protects:
   *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
   * inode_hash_lock protects:
   *   inode_hashtable, inode->i_hash
@@@ -46,9 -46,9 +46,9 @@@
   *
   * inode_sb_list_lock
   *   inode->i_lock
 - *     inode_lru_lock
 + *     inode->i_sb->s_inode_lru_lock
   *
-  * inode_wb_list_lock
+  * bdi->wb.list_lock
   *   inode->i_lock
   *
   * inode_hash_lock
@@@ -64,9 -64,22 +64,8 @@@ static unsigned int i_hash_shift __read
  static struct hlist_head *inode_hashtable __read_mostly;
  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
  
 -static LIST_HEAD(inode_lru);
 -static DEFINE_SPINLOCK(inode_lru_lock);
 -
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
  
 -/*
 - * iprune_sem provides exclusion between the icache shrinking and the
 - * umount path.
 - *
 - * We don't actually need it to protect anything in the umount path,
 - * but only need to cycle through it to make sure any inode that
 - * prune_icache took off the LRU list has been fully torn down by the
 - * time we are past evict_inodes.
 - */
 -static DECLARE_RWSEM(iprune_sem);
 -
  /*
   * Empty aops. Can be used for the cases where the user does not
   * define any of the address_space operations.
@@@ -81,7 -94,6 +80,7 @@@ EXPORT_SYMBOL(empty_aops)
  struct inodes_stat_t inodes_stat;
  
  static DEFINE_PER_CPU(unsigned int, nr_inodes);
 +static DEFINE_PER_CPU(unsigned int, nr_unused);
  
  static struct kmem_cache *inode_cachep __read_mostly;
  
@@@ -96,11 -108,7 +95,11 @@@ static int get_nr_inodes(void
  
  static inline int get_nr_inodes_unused(void)
  {
 -      return inodes_stat.nr_unused;
 +      int i;
 +      int sum = 0;
 +      for_each_possible_cpu(i)
 +              sum += per_cpu(nr_unused, i);
 +      return sum < 0 ? 0 : sum;
  }
  
  int get_nr_dirty_inodes(void)
@@@ -118,7 -126,6 +117,7 @@@ int proc_nr_inodes(ctl_table *table, in
                   void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        inodes_stat.nr_inodes = get_nr_inodes();
 +      inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_dointvec(table, write, buffer, lenp, ppos);
  }
  #endif
@@@ -168,7 -175,8 +167,7 @@@ int inode_init_always(struct super_bloc
        mutex_init(&inode->i_mutex);
        lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
  
 -      init_rwsem(&inode->i_alloc_sem);
 -      lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
 +      atomic_set(&inode->i_dio_count, 0);
  
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
@@@ -328,24 -336,22 +327,24 @@@ EXPORT_SYMBOL(ihold)
  
  static void inode_lru_list_add(struct inode *inode)
  {
 -      spin_lock(&inode_lru_lock);
 +      spin_lock(&inode->i_sb->s_inode_lru_lock);
        if (list_empty(&inode->i_lru)) {
 -              list_add(&inode->i_lru, &inode_lru);
 -              inodes_stat.nr_unused++;
 +              list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
 +              inode->i_sb->s_nr_inodes_unused++;
 +              this_cpu_inc(nr_unused);
        }
 -      spin_unlock(&inode_lru_lock);
 +      spin_unlock(&inode->i_sb->s_inode_lru_lock);
  }
  
  static void inode_lru_list_del(struct inode *inode)
  {
 -      spin_lock(&inode_lru_lock);
 +      spin_lock(&inode->i_sb->s_inode_lru_lock);
        if (!list_empty(&inode->i_lru)) {
                list_del_init(&inode->i_lru);
 -              inodes_stat.nr_unused--;
 +              inode->i_sb->s_nr_inodes_unused--;
 +              this_cpu_dec(nr_unused);
        }
 -      spin_unlock(&inode_lru_lock);
 +      spin_unlock(&inode->i_sb->s_inode_lru_lock);
  }
  
  /**
@@@ -416,14 -422,7 +415,14 @@@ EXPORT_SYMBOL(remove_inode_hash)
  void end_writeback(struct inode *inode)
  {
        might_sleep();
 +      /*
 +       * We have to cycle tree_lock here because reclaim can be still in the
 +       * process of removing the last page (in __delete_from_page_cache())
 +       * and we must not free mapping under it.
 +       */
 +      spin_lock_irq(&inode->i_data.tree_lock);
        BUG_ON(inode->i_data.nrpages);
 +      spin_unlock_irq(&inode->i_data.tree_lock);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
@@@ -530,6 -529,14 +529,6 @@@ void evict_inodes(struct super_block *s
        spin_unlock(&inode_sb_list_lock);
  
        dispose_list(&dispose);
 -
 -      /*
 -       * Cycle through iprune_sem to make sure any inode that prune_icache
 -       * moved off the list before we took the lock has been fully torn
 -       * down.
 -       */
 -      down_write(&iprune_sem);
 -      up_write(&iprune_sem);
  }
  
  /**
@@@ -592,10 -599,8 +591,10 @@@ static int can_unuse(struct inode *inod
  }
  
  /*
 - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
 - * temporary list and then are freed outside inode_lru_lock by dispose_list().
 + * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 + * This is called from the superblock shrinker function with a number of inodes
 + * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 + * then are freed outside inode_lock by dispose_list().
   *
   * Any inodes which are pinned purely because of attached pagecache have their
   * pagecache removed.  If the inode has metadata buffers attached to
   * LRU does not have strict ordering. Hence we don't want to reclaim inodes
   * with this flag set because they are the inodes that are out of order.
   */
 -static void prune_icache(int nr_to_scan)
 +void prune_icache_sb(struct super_block *sb, int nr_to_scan)
  {
        LIST_HEAD(freeable);
        int nr_scanned;
        unsigned long reap = 0;
  
 -      down_read(&iprune_sem);
 -      spin_lock(&inode_lru_lock);
 -      for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 +      spin_lock(&sb->s_inode_lru_lock);
 +      for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
                struct inode *inode;
  
 -              if (list_empty(&inode_lru))
 +              if (list_empty(&sb->s_inode_lru))
                        break;
  
 -              inode = list_entry(inode_lru.prev, struct inode, i_lru);
 +              inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
  
                /*
 -               * we are inverting the inode_lru_lock/inode->i_lock here,
 +               * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
                 * so use a trylock. If we fail to get the lock, just move the
                 * inode to the back of the list so we don't spin on it.
                 */
                if (!spin_trylock(&inode->i_lock)) {
 -                      list_move(&inode->i_lru, &inode_lru);
 +                      list_move(&inode->i_lru, &sb->s_inode_lru);
                        continue;
                }
  
                    (inode->i_state & ~I_REFERENCED)) {
                        list_del_init(&inode->i_lru);
                        spin_unlock(&inode->i_lock);
 -                      inodes_stat.nr_unused--;
 +                      sb->s_nr_inodes_unused--;
 +                      this_cpu_dec(nr_unused);
                        continue;
                }
  
                /* recently referenced inodes get one more pass */
                if (inode->i_state & I_REFERENCED) {
                        inode->i_state &= ~I_REFERENCED;
 -                      list_move(&inode->i_lru, &inode_lru);
 +                      list_move(&inode->i_lru, &sb->s_inode_lru);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                        __iget(inode);
                        spin_unlock(&inode->i_lock);
 -                      spin_unlock(&inode_lru_lock);
 +                      spin_unlock(&sb->s_inode_lru_lock);
                        if (remove_inode_buffers(inode))
                                reap += invalidate_mapping_pages(&inode->i_data,
                                                                0, -1);
                        iput(inode);
 -                      spin_lock(&inode_lru_lock);
 +                      spin_lock(&sb->s_inode_lru_lock);
  
 -                      if (inode != list_entry(inode_lru.next,
 +                      if (inode != list_entry(sb->s_inode_lru.next,
                                                struct inode, i_lru))
                                continue;       /* wrong inode or list_empty */
                        /* avoid lock inversions with trylock */
                spin_unlock(&inode->i_lock);
  
                list_move(&inode->i_lru, &freeable);
 -              inodes_stat.nr_unused--;
 +              sb->s_nr_inodes_unused--;
 +              this_cpu_dec(nr_unused);
        }
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
        else
                __count_vm_events(PGINODESTEAL, reap);
 -      spin_unlock(&inode_lru_lock);
 +      spin_unlock(&sb->s_inode_lru_lock);
  
        dispose_list(&freeable);
 -      up_read(&iprune_sem);
  }
  
 -/*
 - * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
 - * "unused" means that no dentries are referring to the inodes: the files are
 - * not open and the dcache references to those inodes have already been
 - * reclaimed.
 - *
 - * This function is passed the number of inodes to scan, and it returns the
 - * total number of remaining possibly-reclaimable inodes.
 - */
 -static int shrink_icache_memory(struct shrinker *shrink,
 -                              struct shrink_control *sc)
 -{
 -      int nr = sc->nr_to_scan;
 -      gfp_t gfp_mask = sc->gfp_mask;
 -
 -      if (nr) {
 -              /*
 -               * Nasty deadlock avoidance.  We may hold various FS locks,
 -               * and we don't want to recurse into the FS that called us
 -               * in clear_inode() and friends..
 -               */
 -              if (!(gfp_mask & __GFP_FS))
 -                      return -1;
 -              prune_icache(nr);
 -      }
 -      return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 -}
 -
 -static struct shrinker icache_shrinker = {
 -      .shrink = shrink_icache_memory,
 -      .seeks = DEFAULT_SEEKS,
 -};
 -
  static void __wait_on_freeing_inode(struct inode *inode);
  /*
   * Called with the inode lock held.
@@@ -1285,7 -1323,7 +1284,7 @@@ static void iput_final(struct inode *in
  
        WARN_ON(inode->i_state & I_NEW);
  
 -      if (op && op->drop_inode)
 +      if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);
@@@ -1571,6 -1609,7 +1570,6 @@@ void __init inode_init(void
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_MEM_SPREAD),
                                         init_once);
 -      register_shrinker(&icache_shrinker);
  
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --combined fs/nfs/write.c
@@@ -409,7 -409,7 +409,7 @@@ out
   */
  static void nfs_inode_remove_request(struct nfs_page *req)
  {
 -      struct inode *inode = req->wb_context->path.dentry->d_inode;
 +      struct inode *inode = req->wb_context->dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
  
        BUG_ON (!NFS_WBACK_BUSY(req));
@@@ -438,7 -438,7 +438,7 @@@ nfs_mark_request_dirty(struct nfs_page 
  static void
  nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
  {
 -      struct inode *inode = req->wb_context->path.dentry->d_inode;
 +      struct inode *inode = req->wb_context->dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
  
        spin_lock(&inode->i_lock);
@@@ -852,20 -852,18 +852,20 @@@ static int nfs_write_rpcsetup(struct nf
                struct pnfs_layout_segment *lseg,
                int how)
  {
 -      struct inode *inode = req->wb_context->path.dentry->d_inode;
 +      struct inode *inode = req->wb_context->dentry->d_inode;
  
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
  
        data->req = req;
 -      data->inode = inode = req->wb_context->path.dentry->d_inode;
 +      data->inode = inode = req->wb_context->dentry->d_inode;
        data->cred = req->wb_context->cred;
        data->lseg = get_lseg(lseg);
  
        data->args.fh     = NFS_FH(inode);
        data->args.offset = req_offset(req) + offset;
 +      /* pnfs_set_layoutcommit needs this */
 +      data->mds_offset = data->args.offset;
        data->args.pgbase = req->wb_pgbase + offset;
        data->args.pages  = data->pagevec;
        data->args.count  = count;
@@@ -1053,9 -1051,9 +1053,9 @@@ static void nfs_writeback_done_partial(
  
        dprintk("NFS: %5u write(%s/%lld %d@%lld)",
                task->tk_pid,
 -              data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
 +              data->req->wb_context->dentry->d_inode->i_sb->s_id,
                (long long)
 -                NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
 +                NFS_FILEID(data->req->wb_context->dentry->d_inode),
                data->req->wb_bytes, (long long)req_offset(data->req));
  
        nfs_writeback_done(task, data);
@@@ -1148,8 -1146,8 +1148,8 @@@ static void nfs_writeback_release_full(
  
                dprintk("NFS: %5u write (%s/%lld %d@%lld)",
                        data->task.tk_pid,
 -                      req->wb_context->path.dentry->d_inode->i_sb->s_id,
 -                      (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 +                      req->wb_context->dentry->d_inode->i_sb->s_id,
 +                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
                        req->wb_bytes,
                        (long long)req_offset(req));
  
@@@ -1347,7 -1345,7 +1347,7 @@@ void nfs_init_commit(struct nfs_write_d
                            struct pnfs_layout_segment *lseg)
  {
        struct nfs_page *first = nfs_list_entry(head->next);
 -      struct inode *inode = first->wb_context->path.dentry->d_inode;
 +      struct inode *inode = first->wb_context->dentry->d_inode;
  
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
@@@ -1435,8 -1433,8 +1435,8 @@@ void nfs_commit_release_pages(struct nf
                nfs_clear_request_commit(req);
  
                dprintk("NFS:       commit (%s/%lld %d@%lld)",
 -                      req->wb_context->path.dentry->d_inode->i_sb->s_id,
 -                      (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 +                      req->wb_context->dentry->d_sb->s_id,
 +                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
                        req->wb_bytes,
                        (long long)req_offset(req));
                if (status < 0) {
@@@ -1566,8 -1564,7 +1566,7 @@@ int nfs_write_inode(struct inode *inode
                int status;
                bool sync = true;
  
-               if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
-                   wbc->for_background)
+               if (wbc->sync_mode == WB_SYNC_NONE)
                        sync = false;
  
                status = pnfs_layoutcommit_inode(inode, sync);
@@@ -26,7 -26,7 +26,7 @@@ TRACE_EVENT(ext4_free_inode
                __field(        umode_t, mode                   )
                __field(        uid_t,  uid                     )
                __field(        gid_t,  gid                     )
 -              __field(        blkcnt_t, blocks                )
 +              __field(        __u64, blocks                   )
        ),
  
        TP_fast_assign(
@@@ -40,8 -40,9 +40,8 @@@
  
        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                __entry->mode, __entry->uid, __entry->gid,
 -                (unsigned long long) __entry->blocks)
 +                (unsigned long) __entry->ino, __entry->mode,
 +                __entry->uid, __entry->gid, __entry->blocks)
  );
  
  TRACE_EVENT(ext4_request_inode,
@@@ -177,7 -178,7 +177,7 @@@ TRACE_EVENT(ext4_begin_ordered_truncate
        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (long long) __entry->new_size)
 +                __entry->new_size)
  );
  
  DECLARE_EVENT_CLASS(ext4__write_begin,
                __entry->flags  = flags;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->flags)
@@@ -247,7 -248,7 +247,7 @@@ DECLARE_EVENT_CLASS(ext4__write_end
                __entry->copied = copied;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->copied)
@@@ -285,6 -286,29 +285,6 @@@ DEFINE_EVENT(ext4__write_end, ext4_da_w
        TP_ARGS(inode, pos, len, copied)
  );
  
 -TRACE_EVENT(ext4_writepage,
 -      TP_PROTO(struct inode *inode, struct page *page),
 -
 -      TP_ARGS(inode, page),
 -
 -      TP_STRUCT__entry(
 -              __field(        dev_t,  dev                     )
 -              __field(        ino_t,  ino                     )
 -              __field(        pgoff_t, index                  )
 -
 -      ),
 -
 -      TP_fast_assign(
 -              __entry->dev    = inode->i_sb->s_dev;
 -              __entry->ino    = inode->i_ino;
 -              __entry->index  = page->index;
 -      ),
 -
 -      TP_printk("dev %d,%d ino %lu page_index %lu",
 -                MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino, __entry->index)
 -);
 -
  TRACE_EVENT(ext4_da_writepages,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),
  
        ),
  
        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
 -                "range_start %llu range_end %llu sync_mode %d"
 +                "range_start %lld range_end %lld sync_mode %d"
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nr_to_write,
@@@ -380,7 -404,6 +380,6 @@@ TRACE_EVENT(ext4_da_writepages_result
                __field(        int,    pages_written           )
                __field(        long,   pages_skipped           )
                __field(        int,    sync_mode               )
-               __field(        char,   more_io                 )       
                __field(       pgoff_t, writeback_index         )
        ),
  
                __entry->pages_written  = pages_written;
                __entry->pages_skipped  = wbc->pages_skipped;
                __entry->sync_mode      = wbc->sync_mode;
-               __entry->more_io        = wbc->more_io;
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
  
        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
-                 " more_io %d sync_mode %d writeback_index %lu",
+                 "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
-                 __entry->more_io, __entry->sync_mode,
+                 __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
  );
  
@@@ -425,14 -447,7 +423,14 @@@ DECLARE_EVENT_CLASS(ext4__page_op
        TP_printk("dev %d,%d ino %lu page_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->index)
 +                (unsigned long) __entry->index)
 +);
 +
 +DEFINE_EVENT(ext4__page_op, ext4_writepage,
 +
 +      TP_PROTO(struct page *page),
 +
 +      TP_ARGS(page)
  );
  
  DEFINE_EVENT(ext4__page_op, ext4_readpage,
@@@ -472,7 -487,7 +470,7 @@@ TRACE_EVENT(ext4_invalidatepage
        TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->index, __entry->offset)
 +                (unsigned long) __entry->index, __entry->offset)
  );
  
  TRACE_EVENT(ext4_discard_blocks,
@@@ -545,10 -560,12 +543,10 @@@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_n
  );
  
  TRACE_EVENT(ext4_mb_release_inode_pa,
 -      TP_PROTO(struct super_block *sb,
 -               struct inode *inode,
 -               struct ext4_prealloc_space *pa,
 +      TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),
  
 -      TP_ARGS(sb, inode, pa, block, count),
 +      TP_ARGS(pa, block, count),
  
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
        ),
  
        TP_fast_assign(
 -              __entry->dev            = sb->s_dev;
 -              __entry->ino            = inode->i_ino;
 +              __entry->dev            = pa->pa_inode->i_sb->s_dev;
 +              __entry->ino            = pa->pa_inode->i_ino;
                __entry->block          = block;
                __entry->count          = count;
        ),
  );
  
  TRACE_EVENT(ext4_mb_release_group_pa,
 -      TP_PROTO(struct super_block *sb,
 -               struct ext4_prealloc_space *pa),
 +      TP_PROTO(struct ext4_prealloc_space *pa),
  
 -      TP_ARGS(sb, pa),
 +      TP_ARGS(pa),
  
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
        ),
  
        TP_fast_assign(
 -              __entry->dev            = sb->s_dev;
 +              __entry->dev            = pa->pa_inode->i_sb->s_dev;
                __entry->pa_pstart      = pa->pa_pstart;
                __entry->pa_len         = pa->pa_len;
        ),
@@@ -646,10 -664,10 +644,10 @@@ TRACE_EVENT(ext4_request_blocks
                __field(        ino_t,  ino                     )
                __field(        unsigned int, flags             )
                __field(        unsigned int, len               )
 -              __field(        __u64,  logical                 )
 +              __field(        __u32,  logical                 )
 +              __field(        __u32,  lleft                   )
 +              __field(        __u32,  lright                  )
                __field(        __u64,  goal                    )
 -              __field(        __u64,  lleft                   )
 -              __field(        __u64,  lright                  )
                __field(        __u64,  pleft                   )
                __field(        __u64,  pright                  )
        ),
                __entry->pright = ar->pright;
        ),
  
 -      TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu "
 -                "lleft %llu lright %llu pleft %llu pright %llu ",
 +      TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
 +                "lleft %u lright %u pleft %llu pright %llu ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                __entry->flags, __entry->len,
 -                (unsigned long long) __entry->logical,
 -                (unsigned long long) __entry->goal,
 -                (unsigned long long) __entry->lleft,
 -                (unsigned long long) __entry->lright,
 -                (unsigned long long) __entry->pleft,
 -                (unsigned long long) __entry->pright)
 +                (unsigned long) __entry->ino, __entry->flags,
 +                __entry->len, __entry->logical, __entry->goal,
 +                __entry->lleft, __entry->lright, __entry->pleft,
 +                __entry->pright)
  );
  
  TRACE_EVENT(ext4_allocate_blocks,
                __field(        __u64,  block                   )
                __field(        unsigned int, flags             )
                __field(        unsigned int, len               )
 -              __field(        __u64,  logical                 )
 +              __field(        __u32,  logical                 )
 +              __field(        __u32,  lleft                   )
 +              __field(        __u32,  lright                  )
                __field(        __u64,  goal                    )
 -              __field(        __u64,  lleft                   )
 -              __field(        __u64,  lright                  )
                __field(        __u64,  pleft                   )
                __field(        __u64,  pright                  )
        ),
                __entry->pright = ar->pright;
        ),
  
 -      TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu "
 -                "goal %llu lleft %llu lright %llu pleft %llu pright %llu",
 +      TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
 +                "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                __entry->flags, __entry->len, __entry->block,
 -                (unsigned long long) __entry->logical,
 -                (unsigned long long) __entry->goal,
 -                (unsigned long long) __entry->lleft,
 -                (unsigned long long) __entry->lright,
 -                (unsigned long long) __entry->pleft,
 -                (unsigned long long) __entry->pright)
 +                (unsigned long) __entry->ino, __entry->flags,
 +                __entry->len, __entry->block, __entry->logical,
 +                __entry->goal,  __entry->lleft, __entry->lright,
 +                __entry->pleft, __entry->pright)
  );
  
  TRACE_EVENT(ext4_free_blocks,
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(        ino_t,  ino                     )
 -              __field(      umode_t, mode                     )
 +              __field(        umode_t, mode                   )
                __field(        __u64,  block                   )
                __field(        unsigned long,  count           )
 -              __field(         int,   flags                   )
 +              __field(        int,    flags                   )
        ),
  
        TP_fast_assign(
@@@ -770,7 -796,7 +768,7 @@@ TRACE_EVENT(ext4_sync_file_enter
                __entry->parent         = dentry->d_parent->d_inode->i_ino;
        ),
  
 -      TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
 +      TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
@@@ -793,7 -819,7 +791,7 @@@ TRACE_EVENT(ext4_sync_file_exit
                __entry->dev            = inode->i_sb->s_dev;
        ),
  
 -      TP_printk("dev %d,%d ino %ld ret %d",
 +      TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
@@@ -977,7 -1003,7 +975,7 @@@ DECLARE_EVENT_CLASS(ext4__mballoc
                __entry->result_len     = len;
        ),
  
 -      TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
 +      TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->result_group, __entry->result_start,
@@@ -1065,7 -1091,7 +1063,7 @@@ TRACE_EVENT(ext4_da_update_reserve_spac
                  "allocated_meta_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->mode,  (unsigned long long) __entry->i_blocks,
 +                __entry->mode, __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -1099,7 -1125,7 +1097,7 @@@ TRACE_EVENT(ext4_da_reserve_space
                  "reserved_data_blocks %d reserved_meta_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->mode, (unsigned long long) __entry->i_blocks,
 +                __entry->mode, __entry->i_blocks,
                  __entry->md_needed, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks)
  );
@@@ -1136,7 -1162,7 +1134,7 @@@ TRACE_EVENT(ext4_da_release_space
                  "allocated_meta_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->mode, (unsigned long long) __entry->i_blocks,
 +                __entry->mode, __entry->i_blocks,
                  __entry->freed_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -1211,15 -1237,14 +1209,15 @@@ TRACE_EVENT(ext4_direct_IO_enter
                __entry->rw     = rw;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos, __entry->len, __entry->rw)
 +                __entry->pos, __entry->len, __entry->rw)
  );
  
  TRACE_EVENT(ext4_direct_IO_exit,
 -      TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret),
 +      TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
 +               int rw, int ret),
  
        TP_ARGS(inode, offset, len, rw, ret),
  
                __entry->ret    = ret;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos, __entry->len,
 +                __entry->pos, __entry->len,
                  __entry->rw, __entry->ret)
  );
  
@@@ -1269,15 -1294,15 +1267,15 @@@ TRACE_EVENT(ext4_fallocate_enter
                __entry->mode   = mode;
        ),
  
 -      TP_printk("dev %d,%d ino %ld pos %llu len %llu mode %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos,
 -                (unsigned long long) __entry->len, __entry->mode)
 +                (unsigned long) __entry->ino, __entry->pos,
 +                __entry->len, __entry->mode)
  );
  
  TRACE_EVENT(ext4_fallocate_exit,
 -      TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret),
 +      TP_PROTO(struct inode *inode, loff_t offset,
 +               unsigned int max_blocks, int ret),
  
        TP_ARGS(inode, offset, max_blocks, ret),
  
                __field(        ino_t,  ino                     )
                __field(        dev_t,  dev                     )
                __field(        loff_t, pos                     )
 -              __field(        unsigned,       blocks          )
 +              __field(        unsigned int,   blocks          )
                __field(        int,    ret                     )
        ),
  
                __entry->ret    = ret;
        ),
  
 -      TP_printk("dev %d,%d ino %ld pos %llu blocks %d ret %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos, __entry->blocks,
 +                __entry->pos, __entry->blocks,
                  __entry->ret)
  );
  
@@@ -1323,7 -1348,7 +1321,7 @@@ TRACE_EVENT(ext4_unlink_enter
                __entry->dev            = dentry->d_inode->i_sb->s_dev;
        ),
  
 -      TP_printk("dev %d,%d ino %ld size %lld parent %ld",
 +      TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->size,
                  (unsigned long) __entry->parent)
@@@ -1346,7 -1371,7 +1344,7 @@@ TRACE_EVENT(ext4_unlink_exit
                __entry->ret            = ret;
        ),
  
 -      TP_printk("dev %d,%d ino %ld ret %d",
 +      TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
@@@ -1360,7 -1385,7 +1358,7 @@@ DECLARE_EVENT_CLASS(ext4__truncate
        TP_STRUCT__entry(
                __field(        ino_t,          ino             )
                __field(        dev_t,          dev             )
 -              __field(        blkcnt_t,       blocks          )
 +              __field(        __u64,          blocks          )
        ),
  
        TP_fast_assign(
                __entry->blocks = inode->i_blocks;
        ),
  
 -      TP_printk("dev %d,%d ino %lu blocks %lu",
 +      TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
 +                (unsigned long) __entry->ino, __entry->blocks)
  );
  
  DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
@@@ -1390,7 -1415,7 +1388,7 @@@ DEFINE_EVENT(ext4__truncate, ext4_trunc
  
  DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 -               unsigned len, unsigned flags),
 +               unsigned int len, unsigned int flags),
  
        TP_ARGS(inode, lblk, len, flags),
  
                __field(        ino_t,          ino             )
                __field(        dev_t,          dev             )
                __field(        ext4_lblk_t,    lblk            )
 -              __field(        unsigned,       len             )
 -              __field(        unsigned,       flags           )
 +              __field(        unsigned int,   len             )
 +              __field(        unsigned int,   flags           )
        ),
  
        TP_fast_assign(
        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned) __entry->lblk, __entry->len, __entry->flags)
 +                __entry->lblk, __entry->len, __entry->flags)
  );
  
  DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
@@@ -1432,7 -1457,7 +1430,7 @@@ DEFINE_EVENT(ext4__map_blocks_enter, ex
  
  DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 -               ext4_fsblk_t pblk, unsigned len, int ret),
 +               ext4_fsblk_t pblk, unsigned int len, int ret),
  
        TP_ARGS(inode, lblk, pblk, len, ret),
  
                __field(        dev_t,          dev             )
                __field(        ext4_lblk_t,    lblk            )
                __field(        ext4_fsblk_t,   pblk            )
 -              __field(        unsigned,       len             )
 +              __field(        unsigned int,   len             )
                __field(        int,            ret             )
        ),
  
        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
 +                __entry->lblk, __entry->pblk,
                  __entry->len, __entry->ret)
  );
  
@@@ -1497,7 -1522,7 +1495,7 @@@ TRACE_EVENT(ext4_ext_load_extent
        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned) __entry->lblk, (unsigned long long) __entry->pblk)
 +                __entry->lblk, __entry->pblk)
  );
  
  TRACE_EVENT(ext4_load_inode,
diff --combined mm/backing-dev.c
@@@ -45,6 -45,17 +45,17 @@@ static struct timer_list sync_supers_ti
  static int bdi_sync_supers(void *);
  static void sync_supers_timer_fn(unsigned long);
  
+ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+ {
+       if (wb1 < wb2) {
+               spin_lock(&wb1->list_lock);
+               spin_lock_nested(&wb2->list_lock, 1);
+       } else {
+               spin_lock(&wb2->list_lock);
+               spin_lock_nested(&wb1->list_lock, 1);
+       }
+ }
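The helper above avoids ABBA deadlocks by always taking the lower-addressed
lock first, whichever order the caller names them in. A minimal userspace
sketch of the same idiom (hypothetical names lock_two/unlock_two, pthread
mutexes instead of spinlocks, and without the lockdep annotation that
spin_lock_nested() provides):

#include <pthread.h>
#include <stdio.h>

/* Always acquire the lower-addressed mutex first; callers are expected to
 * pass two distinct locks, just as bdi_lock_two()'s callers do. */
static void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Both calls acquire m1 and m2 in the same global (address) order,
         * so two threads doing this concurrently can never deadlock. */
        lock_two(&m1, &m2);
        unlock_two(&m1, &m2);
        lock_two(&m2, &m1);
        unlock_two(&m2, &m1);
        printf("lock order fixed by address: no ABBA deadlock possible\n");
        return 0;
}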
  #ifdef CONFIG_DEBUG_FS
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
@@@ -67,34 -78,42 +78,42 @@@ static int bdi_debug_stats_show(struct 
        struct inode *inode;
  
        nr_dirty = nr_io = nr_more_io = 0;
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
  
        global_dirty_limits(&background_thresh, &dirty_thresh);
        bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
  
  #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
-                  "BdiWriteback:     %8lu kB\n"
-                  "BdiReclaimable:   %8lu kB\n"
-                  "BdiDirtyThresh:   %8lu kB\n"
-                  "DirtyThresh:      %8lu kB\n"
-                  "BackgroundThresh: %8lu kB\n"
-                  "b_dirty:          %8lu\n"
-                  "b_io:             %8lu\n"
-                  "b_more_io:        %8lu\n"
-                  "bdi_list:         %8u\n"
-                  "state:            %8lx\n",
+                  "BdiWriteback:       %10lu kB\n"
+                  "BdiReclaimable:     %10lu kB\n"
+                  "BdiDirtyThresh:     %10lu kB\n"
+                  "DirtyThresh:        %10lu kB\n"
+                  "BackgroundThresh:   %10lu kB\n"
+                  "BdiWritten:         %10lu kB\n"
+                  "BdiWriteBandwidth:  %10lu kBps\n"
+                  "b_dirty:            %10lu\n"
+                  "b_io:               %10lu\n"
+                  "b_more_io:          %10lu\n"
+                  "bdi_list:           %10u\n"
+                  "state:              %10lx\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-                  K(bdi_thresh), K(dirty_thresh),
-                  K(background_thresh), nr_dirty, nr_io, nr_more_io,
+                  K(bdi_thresh),
+                  K(dirty_thresh),
+                  K(background_thresh),
+                  (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+                  (unsigned long) K(bdi->write_bandwidth),
+                  nr_dirty,
+                  nr_io,
+                  nr_more_io,
                   !list_empty(&bdi->bdi_list), bdi->state);
  #undef K
  
@@@ -249,18 -268,6 +268,6 @@@ int bdi_has_dirty_io(struct backing_dev
        return wb_has_dirty_io(&bdi->wb);
  }
  
- static void bdi_flush_io(struct backing_dev_info *bdi)
- {
-       struct writeback_control wbc = {
-               .sync_mode              = WB_SYNC_NONE,
-               .older_than_this        = NULL,
-               .range_cyclic           = 1,
-               .nr_to_write            = 1024,
-       };
-       writeback_inodes_wb(&bdi->wb, &wbc);
- }
  /*
   * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
   * or we risk deadlocking on ->s_umount. The longer term solution would be
@@@ -446,9 -453,10 +453,10 @@@ static int bdi_forker_thread(void *ptr
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
-                                * the bdi from the thread.
+                                * the bdi from the thread. Hopefully 1024 is
+                                * large enough for efficient IO.
                                 */
-                               bdi_flush_io(bdi);
+                               writeback_inodes_wb(&bdi->wb, 1024);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
@@@ -505,7 -513,7 +513,7 @@@ static void bdi_remove_from_list(struc
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);
  
 -      synchronize_rcu();
 +      synchronize_rcu_expedited();
  }
  
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@@ -606,7 -614,6 +614,7 @@@ static void bdi_prune_sb(struct backing
  void bdi_unregister(struct backing_dev_info *bdi)
  {
        if (bdi->dev) {
 +              bdi_set_min_ratio(bdi, 0);
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
                del_timer_sync(&bdi->wb.wakeup_timer);
@@@ -629,9 -636,15 +637,15 @@@ static void bdi_wb_init(struct bdi_writ
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
+       spin_lock_init(&wb->list_lock);
        setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
  }
  
+ /*
+  * Initial write bandwidth: 100 MB/s
+  */
+ #define INIT_BW               (100 << (20 - PAGE_SHIFT))
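For reference, write_bandwidth is kept in pages per second, so the expression
above converts 100 MB/s (100 * 2^20 bytes/s) into pages by shifting away
PAGE_SHIFT: with the common 4 KiB pages (PAGE_SHIFT == 12), INIT_BW works out
to 100 << 8 = 25600 pages/s, the estimate every bdi starts from until
__bdi_update_bandwidth() has seen real writeback.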
  int bdi_init(struct backing_dev_info *bdi)
  {
        int i, err;
        }
  
        bdi->dirty_exceeded = 0;
+       bdi->bw_time_stamp = jiffies;
+       bdi->written_stamp = 0;
+       bdi->write_bandwidth = INIT_BW;
+       bdi->avg_write_bandwidth = INIT_BW;
        err = prop_local_init_percpu(&bdi->completions);
  
        if (err) {
@@@ -677,11 -697,12 +698,12 @@@ void bdi_destroy(struct backing_dev_inf
        if (bdi_has_dirty_io(bdi)) {
                struct bdi_writeback *dst = &default_backing_dev_info.wb;
  
-               spin_lock(&inode_wb_list_lock);
+               bdi_lock_two(&bdi->wb, dst);
                list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                list_splice(&bdi->wb.b_io, &dst->b_io);
                list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-               spin_unlock(&inode_wb_list_lock);
+               spin_unlock(&bdi->wb.list_lock);
+               spin_unlock(&dst->list_lock);
        }
  
        bdi_unregister(bdi);
diff --combined mm/filemap.c
   *  ->i_mutex                 (generic_file_buffered_write)
   *    ->mmap_sem              (fault_in_pages_readable->do_page_fault)
   *
-  *  inode_wb_list_lock
 - *  ->i_mutex
 - *    ->i_alloc_sem             (various)
 - *
+  *  bdi->wb.list_lock
   *    sb_lock                 (fs/fs-writeback.c)
   *    ->mapping->tree_lock    (__sync_single_inode)
   *
@@@ -96,9 -99,9 +96,9 @@@
   *    ->zone.lru_lock         (check_pte_range->isolate_lru_page)
   *    ->private_lock          (page_remove_rmap->set_page_dirty)
   *    ->tree_lock             (page_remove_rmap->set_page_dirty)
-  *    inode_wb_list_lock      (page_remove_rmap->set_page_dirty)
+  *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
-  *    inode_wb_list_lock      (zap_pte_range->set_page_dirty)
+  *    bdi.wb->list_lock               (zap_pte_range->set_page_dirty)
   *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
   *
@@@ -128,7 -131,6 +128,7 @@@ void __delete_from_page_cache(struct pa
  
        radix_tree_delete(&mapping->page_tree, page->index);
        page->mapping = NULL;
 +      /* Leave page->index set: truncation lookup relies upon it */
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
@@@ -484,7 -486,6 +484,7 @@@ int add_to_page_cache_locked(struct pag
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
 +                      /* Leave page->index set: truncation relies upon it */
                        spin_unlock_irq(&mapping->tree_lock);
                        mem_cgroup_uncharge_cache_page(page);
                        page_cache_release(page);
@@@ -1794,7 -1795,7 +1794,7 @@@ EXPORT_SYMBOL(generic_file_readonly_mma
  
  static struct page *__read_cache_page(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
  {
@@@ -1825,7 -1826,7 +1825,7 @@@ repeat
  
  static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
  
@@@ -1865,7 -1866,7 +1865,7 @@@ out
   * @mapping:  the page's address_space
   * @index:    the page index
   * @filler:   function to perform the read
 - * @data:     destination for read data
 + * @data:     first arg to filler(data, page) function, often left as NULL
   *
   * Same as read_cache_page, but don't wait for page to become unlocked
   * after submitting it to the filler.
   */
  struct page *read_cache_page_async(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data)
  {
        return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@@ -1925,7 -1926,7 +1925,7 @@@ EXPORT_SYMBOL(read_cache_page_gfp)
   * @mapping:  the page's address_space
   * @index:    the page index
   * @filler:   function to perform the read
 - * @data:     destination for read data
 + * @data:     first arg to filler(data, page) function, often left as NULL
   *
   * Read into the page cache. If a page already exists, and PageUptodate() is
   * not set, try to fill the page then wait for it to become unlocked.
   */
  struct page *read_cache_page(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data)
  {
        return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@@ -1999,7 -2000,7 +1999,7 @@@ int file_remove_suid(struct file *file
                error = security_inode_killpriv(dentry);
        if (!error && killsuid)
                error = __remove_suid(dentry, killsuid);
 -      if (!error)
 +      if (!error && (inode->i_sb->s_flags & MS_NOSEC))
                inode->i_flags |= S_NOSEC;
  
        return error;
diff --combined mm/page-writeback.c
  #include <linux/pagevec.h>
  #include <trace/events/writeback.h>
  
+ /*
+  * Sleep at most 200ms at a time in balance_dirty_pages().
+  */
+ #define MAX_PAUSE             max(HZ/5, 1)
+ /*
+  * Estimate write bandwidth at 200ms intervals.
+  */
+ #define BANDWIDTH_INTERVAL    max(HZ/5, 1)
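Both constants are in jiffies: HZ/5 is 200ms for the usual HZ choices
(20 jiffies at HZ=100, 50 at HZ=250, 200 at HZ=1000), and the max(..., 1)
only guards against the division rounding down to zero on exotically low
HZ configurations.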
  /*
   * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
   * will look to see if it needs to force writeback or throttling.
@@@ -111,6 -121,7 +121,7 @@@ EXPORT_SYMBOL(laptop_mode)
  
  /* End of sysctl-exported parameters */
  
+ unsigned long global_dirty_limit;
  
  /*
   * Scale the writeback cache size proportional to the relative writeout speeds.
@@@ -219,6 -230,7 +230,7 @@@ int dirty_bytes_handler(struct ctl_tabl
   */
  static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
  {
+       __inc_bdi_stat(bdi, BDI_WRITTEN);
        __prop_inc_percpu_max(&vm_completions, &bdi->completions,
                              bdi->max_prop_frac);
  }
@@@ -244,13 -256,8 +256,8 @@@ void task_dirty_inc(struct task_struct 
  static void bdi_writeout_fraction(struct backing_dev_info *bdi,
                long *numerator, long *denominator)
  {
-       if (bdi_cap_writeback_dirty(bdi)) {
-               prop_fraction_percpu(&vm_completions, &bdi->completions,
+       prop_fraction_percpu(&vm_completions, &bdi->completions,
                                numerator, denominator);
-       } else {
-               *numerator = 0;
-               *denominator = 1;
-       }
  }
  
  static inline void task_dirties_fraction(struct task_struct *tsk,
   * effectively curb the growth of dirty pages. Light dirtiers with high enough
   * dirty threshold may never get throttled.
   */
+ #define TASK_LIMIT_FRACTION 8
  static unsigned long task_dirty_limit(struct task_struct *tsk,
                                       unsigned long bdi_dirty)
  {
        long numerator, denominator;
        unsigned long dirty = bdi_dirty;
-       u64 inv = dirty >> 3;
+       u64 inv = dirty / TASK_LIMIT_FRACTION;
  
        task_dirties_fraction(tsk, &numerator, &denominator);
        inv *= numerator;
        return max(dirty, bdi_dirty/2);
  }
  
+ /* Minimum limit for any task */
+ static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+ {
+       return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+ }
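A quick worked example of the two limits above: with bdi_dirty = 100000
pages, a task that has done essentially all of the recent dirtying gets the
full 1/8 knocked off by task_dirty_limit(), i.e. 87500 pages, which is
exactly what task_min_dirty_limit() returns, while a task that dirtied
nothing keeps the whole 100000. balance_dirty_pages() below throttles
against the per-task value but only clears dirty_exceeded once the bdi is
under the lower one, so the flag cannot go away while some task may still be
over its own limit.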
  /*
   *
   */
@@@ -397,6 -411,11 +411,11 @@@ unsigned long determine_dirtyable_memor
        return x + 1;   /* Ensure that we never return 0 */
  }
  
+ static unsigned long hard_dirty_limit(unsigned long thresh)
+ {
+       return max(thresh, global_dirty_limit);
+ }
  /*
   * global_dirty_limits - background-writeback and dirty-throttling thresholds
   *
@@@ -435,12 -454,20 +454,20 @@@ void global_dirty_limits(unsigned long 
        }
        *pbackground = background;
        *pdirty = dirty;
+       trace_global_dirty_state(background, dirty);
  }
  
- /*
+ /**
   * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+  * @bdi: the backing_dev_info to query
+  * @dirty: global dirty limit in pages
   *
-  * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+  * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+  * The "limit" in the name is not treated as a hard limit by
+  * balance_dirty_pages().
+  *
+  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
   * - starving fast devices
   * - piling up dirty pages (that will take long time to sync) on slow devices
   *
@@@ -468,6 -495,153 +495,153 @@@ unsigned long bdi_dirty_limit(struct ba
        return bdi_dirty;
  }
  
+ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+                                      unsigned long elapsed,
+                                      unsigned long written)
+ {
+       const unsigned long period = roundup_pow_of_two(3 * HZ);
+       unsigned long avg = bdi->avg_write_bandwidth;
+       unsigned long old = bdi->write_bandwidth;
+       u64 bw;
+       /*
+        * bw = written * HZ / elapsed
+        *
+        *                   bw * elapsed + write_bandwidth * (period - elapsed)
+        * write_bandwidth = ---------------------------------------------------
+        *                                          period
+        */
+       bw = written - bdi->written_stamp;
+       bw *= HZ;
+       if (unlikely(elapsed > period)) {
+               do_div(bw, elapsed);
+               avg = bw;
+               goto out;
+       }
+       bw += (u64)bdi->write_bandwidth * (period - elapsed);
+       bw >>= ilog2(period);
+       /*
+        * one more level of smoothing, for filtering out sudden spikes
+        */
+       if (avg > old && old >= (unsigned long)bw)
+               avg -= (avg - old) >> 3;
+       if (avg < old && old <= (unsigned long)bw)
+               avg += (old - avg) >> 3;
+ out:
+       bdi->write_bandwidth = bw;
+       bdi->avg_write_bandwidth = avg;
+ }
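A rough, self-contained illustration of the arithmetic above, with made-up
numbers rather than a real trace: at HZ=1000 the period rounds up to 4096
jiffies, and a single update after 200 jiffies during which 5000 pages were
written blends an instantaneous 25000 pages/s with a previous estimate of
25600 pages/s, landing near the old value because elapsed is small relative
to period.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Illustrative values only. */
        const uint64_t HZ = 1000;
        const uint64_t period = 4096;          /* roundup_pow_of_two(3 * HZ) */
        const uint64_t old_bw = 25600;         /* previous estimate, pages/s */
        const uint64_t elapsed = 200;          /* jiffies since last update */
        const uint64_t written = 5000;         /* pages completed meanwhile */

        /* the bw * elapsed term of the formula is simply written * HZ */
        uint64_t bw = written * HZ;
        bw += old_bw * (period - elapsed);     /* weight in the old estimate */
        bw >>= 12;                             /* ilog2(period) == 12 */

        printf("instantaneous %llu pages/s, smoothed %llu pages/s (old %llu)\n",
               (unsigned long long)(written * HZ / elapsed),
               (unsigned long long)bw,
               (unsigned long long)old_bw);
        return 0;
}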
+ /*
+  * The global dirtyable memory and dirty threshold could be suddenly knocked
+  * down by a large amount (eg. on the startup of KVM in a swapless system).
+  * This may throw the system into deep dirty exceeded state and throttle
+  * heavy/light dirtiers alike. To retain good responsiveness, maintain
+  * global_dirty_limit, which is only allowed to track the lowered dirty
+  * threshold gradually.
+  */
+ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+ {
+       unsigned long limit = global_dirty_limit;
+       /*
+        * Follow up in one step.
+        */
+       if (limit < thresh) {
+               limit = thresh;
+               goto update;
+       }
+       /*
+        * Follow down slowly. Use the higher one as the target, because thresh
+        * may drop below dirty. This is exactly why global_dirty_limit is
+        * introduced: it is guaranteed to lie above the number of dirty pages.
+        */
+       thresh = max(thresh, dirty);
+       if (limit > thresh) {
+               limit -= (limit - thresh) >> 5;
+               goto update;
+       }
+       return;
+ update:
+       global_dirty_limit = limit;
+ }
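Because global_update_bandwidth() below runs this at most once per
BANDWIDTH_INTERVAL (200ms) and each run closes 1/32 of the remaining gap, the
tracked limit approaches a lowered threshold roughly as (31/32)^n: about 22
updates, on the order of 4-5 seconds, to halve the distance. That is the
"gradually" the comment above refers to.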
+ static void global_update_bandwidth(unsigned long thresh,
+                                   unsigned long dirty,
+                                   unsigned long now)
+ {
+       static DEFINE_SPINLOCK(dirty_lock);
+       static unsigned long update_time;
+       /*
+        * check locklessly first to avoid taking the lock most of the time
+        */
+       if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+               return;
+       spin_lock(&dirty_lock);
+       if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+               update_dirty_limit(thresh, dirty);
+               update_time = now;
+       }
+       spin_unlock(&dirty_lock);
+ }
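The unlocked time_before() check is purely an optimization so the common case
returns without touching dirty_lock; the decisive comparison is repeated under
the lock, so even if several CPUs race past the first test only one of them
performs the update for any given BANDWIDTH_INTERVAL.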
+ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+                           unsigned long thresh,
+                           unsigned long dirty,
+                           unsigned long bdi_thresh,
+                           unsigned long bdi_dirty,
+                           unsigned long start_time)
+ {
+       unsigned long now = jiffies;
+       unsigned long elapsed = now - bdi->bw_time_stamp;
+       unsigned long written;
+       /*
+        * rate-limit, only update once every 200ms.
+        */
+       if (elapsed < BANDWIDTH_INTERVAL)
+               return;
+       written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+       /*
+        * Skip quiet periods when disk bandwidth is under-utilized.
+        * (at least 1s idle time between two flusher runs)
+        */
+       if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+               goto snapshot;
+       if (thresh)
+               global_update_bandwidth(thresh, dirty, now);
+       bdi_update_write_bandwidth(bdi, elapsed, written);
+ snapshot:
+       bdi->written_stamp = written;
+       bdi->bw_time_stamp = now;
+ }
+ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+                                unsigned long thresh,
+                                unsigned long dirty,
+                                unsigned long bdi_thresh,
+                                unsigned long bdi_dirty,
+                                unsigned long start_time)
+ {
+       if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+               return;
+       spin_lock(&bdi->wb.list_lock);
+       __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+                              start_time);
+       spin_unlock(&bdi->wb.list_lock);
+ }
  /*
   * balance_dirty_pages() must be called by processes which are generating dirty
   * data.  It looks at the number of dirty pages in the machine and will force
  static void balance_dirty_pages(struct address_space *mapping,
                                unsigned long write_chunk)
  {
-       long nr_reclaimable, bdi_nr_reclaimable;
-       long nr_writeback, bdi_nr_writeback;
+       unsigned long nr_reclaimable, bdi_nr_reclaimable;
+       unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
+       unsigned long bdi_dirty;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;
+       unsigned long task_bdi_thresh;
+       unsigned long min_task_bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long pause = 1;
        bool dirty_exceeded = false;
+       bool clear_dirty_exceeded = true;
        struct backing_dev_info *bdi = mapping->backing_dev_info;
+       unsigned long start_time = jiffies;
  
        for (;;) {
-               struct writeback_control wbc = {
-                       .sync_mode      = WB_SYNC_NONE,
-                       .older_than_this = NULL,
-                       .nr_to_write    = write_chunk,
-                       .range_cyclic   = 1,
-               };
                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                        global_page_state(NR_UNSTABLE_NFS);
-               nr_writeback = global_page_state(NR_WRITEBACK);
+               nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
  
                global_dirty_limits(&background_thresh, &dirty_thresh);
  
                 * catch-up. This avoids (excessively) small writeouts
                 * when the bdi limits are ramping up.
                 */
-               if (nr_reclaimable + nr_writeback <=
-                               (background_thresh + dirty_thresh) / 2)
+               if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
                        break;
  
                bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-               bdi_thresh = task_dirty_limit(current, bdi_thresh);
+               min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+               task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
  
                /*
                 * In order to avoid the stacked BDI deadlock we need
                 * actually dirty; with m+n sitting in the percpu
                 * deltas.
                 */
-               if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+               if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
                        bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-                       bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+                       bdi_dirty = bdi_nr_reclaimable +
+                                   bdi_stat_sum(bdi, BDI_WRITEBACK);
                } else {
                        bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-                       bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+                       bdi_dirty = bdi_nr_reclaimable +
+                                   bdi_stat(bdi, BDI_WRITEBACK);
                }
  
                /*
                 * bdi or process from holding back light ones; The latter is
                 * the last resort safeguard.
                 */
-               dirty_exceeded =
-                       (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
-                       || (nr_reclaimable + nr_writeback > dirty_thresh);
+               dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+                                 (nr_dirty > dirty_thresh);
+               clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+                                       (nr_dirty <= dirty_thresh);
  
                if (!dirty_exceeded)
                        break;
                if (!bdi->dirty_exceeded)
                        bdi->dirty_exceeded = 1;
  
+               bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+                                    bdi_thresh, bdi_dirty, start_time);
                /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
                 * Unstable writes are a feature of certain networked
                 * filesystems (i.e. NFS) in which data may have been
                 * threshold otherwise wait until the disk writes catch
                 * up.
                 */
-               trace_wbc_balance_dirty_start(&wbc, bdi);
-               if (bdi_nr_reclaimable > bdi_thresh) {
-                       writeback_inodes_wb(&bdi->wb, &wbc);
-                       pages_written += write_chunk - wbc.nr_to_write;
-                       trace_wbc_balance_dirty_written(&wbc, bdi);
+               trace_balance_dirty_start(bdi);
+               if (bdi_nr_reclaimable > task_bdi_thresh) {
+                       pages_written += writeback_inodes_wb(&bdi->wb,
+                                                            write_chunk);
+                       trace_balance_dirty_written(bdi, pages_written);
                        if (pages_written >= write_chunk)
                                break;          /* We've done our duty */
                }
-               trace_wbc_balance_dirty_wait(&wbc, bdi);
                __set_current_state(TASK_UNINTERRUPTIBLE);
                io_schedule_timeout(pause);
+               trace_balance_dirty_wait(bdi);
+               dirty_thresh = hard_dirty_limit(dirty_thresh);
+               /*
+                * max-pause area. If dirty exceeded but still within this
+                * area, no need to sleep for more than 200ms: (a) 8 pages per
+                * 200ms is typically more than enough to curb heavy dirtiers;
+                * (b) the pause time limit makes the dirtiers more responsive.
+                */
+               if (nr_dirty < dirty_thresh +
+                              dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+                   time_after(jiffies, start_time + MAX_PAUSE))
+                       break;
+               /*
+                * pass-good area. When some bdi gets blocked (eg. NFS server
+                * not responding), or write bandwidth dropped dramatically due
+                * to concurrent reads, or dirty threshold suddenly dropped and
+                * the dirty pages cannot be brought down anytime soon (eg. on
+                * slow USB stick), at least let go of the good bdi's.
+                */
+               if (nr_dirty < dirty_thresh +
+                              dirty_thresh / DIRTY_PASSGOOD_AREA &&
+                   bdi_dirty < bdi_thresh)
+                       break;
  
                /*
                 * Increase the delay for each loop, up to our previous
                        pause = HZ / 10;
        }
  
-       if (!dirty_exceeded && bdi->dirty_exceeded)
+       /* Clear dirty_exceeded flag only when no task can exceed the limit */
+       if (clear_dirty_exceeded && bdi->dirty_exceeded)
                bdi->dirty_exceeded = 0;
  
        if (writeback_in_progress(bdi))
@@@ -626,9 -828,13 +828,13 @@@ static DEFINE_PER_CPU(unsigned long, bd
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                        unsigned long nr_pages_dirtied)
  {
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
        unsigned long ratelimit;
        unsigned long *p;
  
+       if (!bdi_cap_account_dirty(bdi))
+               return;
        ratelimit = ratelimit_pages;
        if (mapping->backing_dev_info->dirty_exceeded)
                ratelimit = 8;
@@@ -892,12 -1098,12 +1098,12 @@@ int write_cache_pages(struct address_sp
                        range_whole = 1;
                cycled = 1; /* ignore range_cyclic tests */
        }
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
  retry:
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
        done_index = index;
        while (!done && (index <= end)) {
@@@ -1141,6 -1347,7 +1347,6 @@@ EXPORT_SYMBOL(account_page_dirtied)
  void account_page_writeback(struct page *page)
  {
        inc_zone_page_state(page, NR_WRITEBACK);
 -      inc_zone_page_state(page, NR_WRITTEN);
  }
  EXPORT_SYMBOL(account_page_writeback);
  
@@@ -1357,10 -1564,8 +1563,10 @@@ int test_clear_page_writeback(struct pa
        } else {
                ret = TestClearPageWriteback(page);
        }
 -      if (ret)
 +      if (ret) {
                dec_zone_page_state(page, NR_WRITEBACK);
 +              inc_zone_page_state(page, NR_WRITTEN);
 +      }
        return ret;
  }
  
@@@ -1406,6 -1611,10 +1612,6 @@@ EXPORT_SYMBOL(test_set_page_writeback)
   */
  int mapping_tagged(struct address_space *mapping, int tag)
  {
 -      int ret;
 -      rcu_read_lock();
 -      ret = radix_tree_tagged(&mapping->page_tree, tag);
 -      rcu_read_unlock();
 -      return ret;
 +      return radix_tree_tagged(&mapping->page_tree, tag);
  }
  EXPORT_SYMBOL(mapping_tagged);
diff --combined mm/rmap.c
+++ b/mm/rmap.c
@@@ -21,6 -21,7 +21,6 @@@
   * Lock ordering in mm:
   *
   * inode->i_mutex     (while writing or truncating, not reading or faulting)
 - *   inode->i_alloc_sem (vmtruncate_range)
   *   mm->mmap_sem
   *     page->flags PG_locked (lock_page)
   *       mapping->i_mmap_mutex
   *               mmlist_lock (in mmput, drain_mmlist and others)
   *               mapping->private_lock (in __set_page_dirty_buffers)
   *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
-  *               inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
   *                 sb_lock (within inode_lock in fs/fs-writeback.c)
   *                 mapping->tree_lock (widely used, in set_page_dirty,
   *                           in arch-dependent flush_dcache_mmap_lock,
-  *                           within inode_wb_list_lock in __sync_single_inode)
+  *                           within bdi.wb->list_lock in __sync_single_inode)
   *
 - * (code doesn't rely on that order so it could be switched around)
 - * ->tasklist_lock
 - *   anon_vma->mutex      (memory_failure, collect_procs_anon)
 + * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
 + *   ->tasklist_lock
   *     pte map lock
   */
  
@@@ -110,9 -112,9 +110,9 @@@ static inline void anon_vma_free(struc
        kmem_cache_free(anon_vma_cachep, anon_vma);
  }
  
 -static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
 +static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
  {
 -      return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
 +      return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
  }
  
  static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@@ -157,7 -159,7 +157,7 @@@ int anon_vma_prepare(struct vm_area_str
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated;
  
 -              avc = anon_vma_chain_alloc();
 +              avc = anon_vma_chain_alloc(GFP_KERNEL);
                if (!avc)
                        goto out_enomem;
  
        return -ENOMEM;
  }
  
 +/*
 + * This is a useful helper function for locking the anon_vma root as
 + * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 + * have the same vma.
 + *
 + * Such anon_vma's should have the same root, so you'd expect to see
 + * just a single mutex_lock for the whole traversal.
 + */
 +static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
 +{
 +      struct anon_vma *new_root = anon_vma->root;
 +      if (new_root != root) {
 +              if (WARN_ON_ONCE(root))
 +                      mutex_unlock(&root->mutex);
 +              root = new_root;
 +              mutex_lock(&root->mutex);
 +      }
 +      return root;
 +}
 +
 +static inline void unlock_anon_vma_root(struct anon_vma *root)
 +{
 +      if (root)
 +              mutex_unlock(&root->mutex);
 +}
 +
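The point of these helpers is lock batching: anon_vma_clone() and
unlink_anon_vmas() below walk a vma's anon_vma_chain, whose entries normally
share one root anon_vma, so lock_anon_vma_root() takes the root mutex once and
only drops and re-takes it on the rare switch to a different root, rather than
locking and unlocking around every list entry the way the removed
anon_vma_unlink() did.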
  static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
        avc->anon_vma = anon_vma;
        list_add(&avc->same_vma, &vma->anon_vma_chain);
  
 -      anon_vma_lock(anon_vma);
        /*
         * It's critical to add new vmas to the tail of the anon_vma,
         * see comment in huge_memory.c:__split_huge_page().
         */
        list_add_tail(&avc->same_anon_vma, &anon_vma->head);
 -      anon_vma_unlock(anon_vma);
  }
  
  /*
  int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
  {
        struct anon_vma_chain *avc, *pavc;
 +      struct anon_vma *root = NULL;
  
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
 -              avc = anon_vma_chain_alloc();
 -              if (!avc)
 -                      goto enomem_failure;
 -              anon_vma_chain_link(dst, avc, pavc->anon_vma);
 +              struct anon_vma *anon_vma;
 +
 +              avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
 +              if (unlikely(!avc)) {
 +                      unlock_anon_vma_root(root);
 +                      root = NULL;
 +                      avc = anon_vma_chain_alloc(GFP_KERNEL);
 +                      if (!avc)
 +                              goto enomem_failure;
 +              }
 +              anon_vma = pavc->anon_vma;
 +              root = lock_anon_vma_root(root, anon_vma);
 +              anon_vma_chain_link(dst, avc, anon_vma);
        }
 +      unlock_anon_vma_root(root);
        return 0;
  
   enomem_failure:
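
The allocation fallback in anon_vma_clone() above deserves a note: the GFP_NOWAIT | __GFP_NOWARN attempt is made first so the batched root lock can stay held across iterations, and only when that fails is the lock dropped and a sleeping GFP_KERNEL allocation tried. Pulled out on its own it looks like this (a hypothetical helper, not part of the patch; it reuses only functions defined in this file):

/* Hypothetical sketch of the allocation fallback used by anon_vma_clone().
 * On the slow path the batched root lock is dropped and *rootp is reset to
 * NULL, so the caller re-locks via lock_anon_vma_root() on the next pass.
 */
static struct anon_vma_chain *avc_alloc_fallback_sketch(struct anon_vma **rootp)
{
	struct anon_vma_chain *avc;

	/* Cheap, non-sleeping attempt while root->mutex may still be held. */
	avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
	if (likely(avc))
		return avc;

	/* Drop the lock before a GFP_KERNEL allocation, which may sleep. */
	unlock_anon_vma_root(*rootp);
	*rootp = NULL;
	return anon_vma_chain_alloc(GFP_KERNEL);	/* may still return NULL */
}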
@@@ -296,7 -263,7 +296,7 @@@ int anon_vma_fork(struct vm_area_struc
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
 -      avc = anon_vma_chain_alloc();
 +      avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;
  
        get_anon_vma(anon_vma->root);
        /* Mark this anon_vma as the one where our new (COWed) pages go. */
        vma->anon_vma = anon_vma;
 +      anon_vma_lock(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
 +      anon_vma_unlock(anon_vma);
  
        return 0;
  
        return -ENOMEM;
  }
  
 -static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 -{
 -      struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
 -      int empty;
 -
 -      /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
 -      if (!anon_vma)
 -              return;
 -
 -      anon_vma_lock(anon_vma);
 -      list_del(&anon_vma_chain->same_anon_vma);
 -
 -      /* We must garbage collect the anon_vma if it's empty */
 -      empty = list_empty(&anon_vma->head);
 -      anon_vma_unlock(anon_vma);
 -
 -      if (empty)
 -              put_anon_vma(anon_vma);
 -}
 -
  void unlink_anon_vmas(struct vm_area_struct *vma)
  {
        struct anon_vma_chain *avc, *next;
 +      struct anon_vma *root = NULL;
  
        /*
         * Unlink each anon_vma chained to the VMA.  This list is ordered
         * from newest to oldest, ensuring the root anon_vma gets freed last.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 -              anon_vma_unlink(avc);
 +              struct anon_vma *anon_vma = avc->anon_vma;
 +
 +              root = lock_anon_vma_root(root, anon_vma);
 +              list_del(&avc->same_anon_vma);
 +
 +              /*
 +               * Leave empty anon_vmas on the list - we'll need
 +               * to free them outside the lock.
 +               */
 +              if (list_empty(&anon_vma->head))
 +                      continue;
 +
 +              list_del(&avc->same_vma);
 +              anon_vma_chain_free(avc);
 +      }
 +      unlock_anon_vma_root(root);
 +
 +      /*
 +       * Iterate the list once more; it now contains only empty and unlinked
 +       * anon_vmas, so destroy them here. This could not be done in the first
 +       * pass because __put_anon_vma() needs to acquire the anon_vma->root->mutex.
 +       */
 +      list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 +              struct anon_vma *anon_vma = avc->anon_vma;
 +
 +              put_anon_vma(anon_vma);
 +
                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
@@@ -869,11 -827,11 +869,11 @@@ int page_referenced(struct page *page
                                                                vm_flags);
                if (we_locked)
                        unlock_page(page);
 +
 +              if (page_test_and_clear_young(page_to_pfn(page)))
 +                      referenced++;
        }
  out:
 -      if (page_test_and_clear_young(page_to_pfn(page)))
 -              referenced++;
 -
        return referenced;
  }