Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Jul 2011 17:39:54 +0000 (10:39 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Jul 2011 17:39:54 +0000 (10:39 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c

fs/block_dev.c
fs/ext4/inode.c
fs/fs-writeback.c
fs/inode.c
fs/nfs/write.c
include/trace/events/ext4.h
mm/backing-dev.c
mm/filemap.c
mm/page-writeback.c
mm/rmap.c

diff --combined fs/block_dev.c
@@@ -44,24 -44,28 +44,28 @@@ inline struct block_device *I_BDEV(stru
  {
        return &BDEV_I(inode)->bdev;
  }
  EXPORT_SYMBOL(I_BDEV);
  
  /*
-  * move the inode from it's current bdi to the a new bdi. if the inode is dirty
-  * we need to move it onto the dirty list of @dst so that the inode is always
-  * on the right list.
+  * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+  * need to move it onto the dirty list of @dst so that the inode is always on
+  * the right list.
   */
  static void bdev_inode_switch_bdi(struct inode *inode,
                        struct backing_dev_info *dst)
  {
-       spin_lock(&inode_wb_list_lock);
+       struct backing_dev_info *old = inode->i_data.backing_dev_info;
+       if (unlikely(dst == old))               /* deadlock avoidance */
+               return;
+       bdi_lock_two(&old->wb, &dst->wb);
        spin_lock(&inode->i_lock);
        inode->i_data.backing_dev_info = dst;
        if (inode->i_state & I_DIRTY)
                list_move(&inode->i_wb_list, &dst->wb.b_dirty);
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&old->wb.list_lock);
+       spin_unlock(&dst->wb.list_lock);
  }
  
  static sector_t max_block(struct block_device *bdev)
@@@ -355,30 -359,25 +359,30 @@@ static loff_t block_llseek(struct file 
        mutex_lock(&bd_inode->i_mutex);
        size = i_size_read(bd_inode);
  
 +      retval = -EINVAL;
        switch (origin) {
 -              case 2:
 +              case SEEK_END:
                        offset += size;
                        break;
 -              case 1:
 +              case SEEK_CUR:
                        offset += file->f_pos;
 +              case SEEK_SET:
 +                      break;
 +              default:
 +                      goto out;
        }
 -      retval = -EINVAL;
        if (offset >= 0 && offset <= size) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                }
                retval = offset;
        }
 +out:
        mutex_unlock(&bd_inode->i_mutex);
        return retval;
  }
        
 -int blkdev_fsync(struct file *filp, int datasync)
 +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
  {
        struct inode *bd_inode = filp->f_mapping->host;
        struct block_device *bdev = I_BDEV(bd_inode);
         * i_mutex and doing so causes performance issues with concurrent
         * O_SYNC writers to a block device.
         */
 -      mutex_unlock(&bd_inode->i_mutex);
 -
        error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
        if (error == -EOPNOTSUPP)
                error = 0;
  
 -      mutex_lock(&bd_inode->i_mutex);
 -
        return error;
  }
  EXPORT_SYMBOL(blkdev_fsync);
@@@ -763,19 -766,7 +767,19 @@@ static struct block_device *bd_start_cl
        if (!disk)
                return ERR_PTR(-ENXIO);
  
 -      whole = bdget_disk(disk, 0);
 +      /*
 +       * Normally, @bdev should equal what's returned from bdget_disk()
 +       * if partno is 0; however, some drivers (floppy) use multiple
 +       * bdev's for the same physical device and @bdev may be one of the
 +       * aliases.  Keep @bdev if partno is 0.  This means claimer
 +       * tracking is broken for those devices but it has always been that
 +       * way.
 +       */
 +      if (partno)
 +              whole = bdget_disk(disk, 0);
 +      else
 +              whole = bdgrab(bdev);
 +
        module_put(disk->fops->owner);
        put_disk(disk);
        if (!whole)
@@@ -1448,8 -1439,6 +1452,8 @@@ static int __blkdev_put(struct block_de
  
  int blkdev_put(struct block_device *bdev, fmode_t mode)
  {
 +      mutex_lock(&bdev->bd_mutex);
 +
        if (mode & FMODE_EXCL) {
                bool bdev_free;
  
                 * are protected with bdev_lock.  bd_mutex is to
                 * synchronize disk_holder unlinking.
                 */
 -              mutex_lock(&bdev->bd_mutex);
                spin_lock(&bdev_lock);
  
                WARN_ON_ONCE(--bdev->bd_holders < 0);
                 * If this was the last claim, remove holder link and
                 * unblock evpoll if it was a write holder.
                 */
 -              if (bdev_free) {
 -                      if (bdev->bd_write_holder) {
 -                              disk_unblock_events(bdev->bd_disk);
 -                              disk_check_events(bdev->bd_disk);
 -                              bdev->bd_write_holder = false;
 -                      }
 +              if (bdev_free && bdev->bd_write_holder) {
 +                      disk_unblock_events(bdev->bd_disk);
 +                      bdev->bd_write_holder = false;
                }
 -
 -              mutex_unlock(&bdev->bd_mutex);
        }
  
 +      /*
 +       * Trigger event checking and tell drivers to flush MEDIA_CHANGE
 +       * event.  This is to ensure detection of media removal commanded
 +       * from userland - e.g. eject(1).
 +       */
 +      disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
 +
 +      mutex_unlock(&bdev->bd_mutex);
 +
        return __blkdev_put(bdev, mode, 0);
  }
  EXPORT_SYMBOL(blkdev_put);
diff --combined fs/ext4/inode.c
@@@ -2634,7 -2634,7 +2634,7 @@@ static int ext4_writepage(struct page *
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
  
 -      trace_ext4_writepage(inode, page);
 +      trace_ext4_writepage(page);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@@ -2741,7 -2741,7 +2741,7 @@@ static int write_cache_pages_da(struct 
        index = wbc->range_start >> PAGE_CACHE_SHIFT;
        end = wbc->range_end >> PAGE_CACHE_SHIFT;
  
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
@@@ -2973,7 -2973,7 +2973,7 @@@ static int ext4_da_writepages(struct ad
        }
  
  retry:
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
  
        while (!ret && wbc->nr_to_write > 0) {
@@@ -3501,8 -3501,10 +3501,8 @@@ retry
                                 offset, nr_segs,
                                 ext4_get_block, NULL, NULL, 0);
        else {
 -              ret = blockdev_direct_IO(rw, iocb, inode,
 -                               inode->i_sb->s_bdev, iov,
 -                               offset, nr_segs,
 -                               ext4_get_block, NULL);
 +              ret = blockdev_direct_IO(rw, iocb, inode, iov,
 +                               offset, nr_segs, ext4_get_block);
  
                if (unlikely((rw & WRITE) && ret < 0)) {
                        loff_t isize = i_size_read(inode);
@@@ -3573,7 -3575,6 +3573,7 @@@ static void ext4_end_io_dio(struct kioc
                            ssize_t size, void *private, int ret,
                            bool is_async)
  {
 +      struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
          ext4_io_end_t *io_end = iocb->private;
        struct workqueue_struct *wq;
        unsigned long flags;
  out:
                if (is_async)
                        aio_complete(iocb, ret, 0);
 +              inode_dio_done(inode);
                return;
        }
  
        /* queue the work to convert unwritten extents to written */
        queue_work(wq, &io_end->work);
        iocb->private = NULL;
 +
 +      /* XXX: probably should move into the real I/O completion handler */
 +      inode_dio_done(inode);
  }
  
  static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@@ -3751,13 -3748,11 +3751,13 @@@ static ssize_t ext4_ext_direct_IO(int r
                        EXT4_I(inode)->cur_aio_dio = iocb->private;
                }
  
 -              ret = blockdev_direct_IO(rw, iocb, inode,
 +              ret = __blockdev_direct_IO(rw, iocb, inode,
                                         inode->i_sb->s_bdev, iov,
                                         offset, nr_segs,
                                         ext4_get_block_write,
 -                                       ext4_end_io_dio);
 +                                       ext4_end_io_dio,
 +                                       NULL,
 +                                       DIO_LOCKING | DIO_SKIP_HOLES);
                if (iocb->private)
                        EXT4_I(inode)->cur_aio_dio = NULL;
                /*
@@@ -5356,8 -5351,6 +5356,8 @@@ int ext4_setattr(struct dentry *dentry
        }
  
        if (attr->ia_valid & ATTR_SIZE) {
 +              inode_dio_wait(inode);
 +
                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  
@@@ -5850,84 -5843,80 +5850,84 @@@ int ext4_page_mkwrite(struct vm_area_st
        struct page *page = vmf->page;
        loff_t size;
        unsigned long len;
 -      int ret = -EINVAL;
 -      void *fsdata;
 +      int ret;
        struct file *file = vma->vm_file;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
 +      handle_t *handle;
 +      get_block_t *get_block;
 +      int retries = 0;
  
        /*
 -       * Get i_alloc_sem to stop truncates messing with the inode. We cannot
 -       * get i_mutex because we are already holding mmap_sem.
 +       * This check is racy but catches the common case. We rely on
 +       * __block_page_mkwrite() to do a reliable check.
         */
 -      down_read(&inode->i_alloc_sem);
 -      size = i_size_read(inode);
 -      if (page->mapping != mapping || size <= page_offset(page)
 -          || !PageUptodate(page)) {
 -              /* page got truncated from under us? */
 -              goto out_unlock;
 +      vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 +      /* Delalloc case is easy... */
 +      if (test_opt(inode->i_sb, DELALLOC) &&
 +          !ext4_should_journal_data(inode) &&
 +          !ext4_nonda_switch(inode->i_sb)) {
 +              do {
 +                      ret = __block_page_mkwrite(vma, vmf,
 +                                                 ext4_da_get_block_prep);
 +              } while (ret == -ENOSPC &&
 +                     ext4_should_retry_alloc(inode->i_sb, &retries));
 +              goto out_ret;
        }
 -      ret = 0;
  
        lock_page(page);
 -      wait_on_page_writeback(page);
 -      if (PageMappedToDisk(page)) {
 -              up_read(&inode->i_alloc_sem);
 -              return VM_FAULT_LOCKED;
 +      size = i_size_read(inode);
 +      /* Page got truncated from under us? */
 +      if (page->mapping != mapping || page_offset(page) > size) {
 +              unlock_page(page);
 +              ret = VM_FAULT_NOPAGE;
 +              goto out;
        }
  
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
        else
                len = PAGE_CACHE_SIZE;
 -
        /*
 -       * return if we have all the buffers mapped. This avoid
 -       * the need to call write_begin/write_end which does a
 -       * journal_start/journal_stop which can block and take
 -       * long time
 +       * Return if we have all the buffers mapped. This avoids the need to do
 +       * journal_start/journal_stop which can block and take a long time
         */
        if (page_has_buffers(page)) {
                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
                                        ext4_bh_unmapped)) {
 -                      up_read(&inode->i_alloc_sem);
 -                      return VM_FAULT_LOCKED;
 +                      /* Wait so that we don't change page under IO */
 +                      wait_on_page_writeback(page);
 +                      ret = VM_FAULT_LOCKED;
 +                      goto out;
                }
        }
        unlock_page(page);
 -      /*
 -       * OK, we need to fill the hole... Do write_begin write_end
 -       * to do block allocation/reservation.We are not holding
 -       * inode.i__mutex here. That allow * parallel write_begin,
 -       * write_end call. lock_page prevent this from happening
 -       * on the same page though
 -       */
 -      ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
 -                      len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 -      if (ret < 0)
 -              goto out_unlock;
 -      ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
 -                      len, len, page, fsdata);
 -      if (ret < 0)
 -              goto out_unlock;
 -      ret = 0;
 -
 -      /*
 -       * write_begin/end might have created a dirty page and someone
 -       * could wander in and start the IO.  Make sure that hasn't
 -       * happened.
 -       */
 -      lock_page(page);
 -      wait_on_page_writeback(page);
 -      up_read(&inode->i_alloc_sem);
 -      return VM_FAULT_LOCKED;
 -out_unlock:
 -      if (ret)
 +      /* OK, we need to fill the hole... */
 +      if (ext4_should_dioread_nolock(inode))
 +              get_block = ext4_get_block_write;
 +      else
 +              get_block = ext4_get_block;
 +retry_alloc:
 +      handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +      if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
 -      up_read(&inode->i_alloc_sem);
 +              goto out;
 +      }
 +      ret = __block_page_mkwrite(vma, vmf, get_block);
 +      if (!ret && ext4_should_journal_data(inode)) {
 +              if (walk_page_buffers(handle, page_buffers(page), 0,
 +                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
 +                      unlock_page(page);
 +                      ret = VM_FAULT_SIGBUS;
 +                      goto out;
 +              }
 +              ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 +      }
 +      ext4_journal_stop(handle);
 +      if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 +              goto retry_alloc;
 +out_ret:
 +      ret = block_page_mkwrite_return(ret);
 +out:
        return ret;
  }
diff --combined fs/fs-writeback.c
@@@ -35,7 -35,9 +35,9 @@@
  struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
+       unsigned long *older_than_this;
        enum writeback_sync_modes sync_mode;
+       unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
@@@ -180,12 -182,13 +182,13 @@@ void bdi_start_background_writeback(str
   */
  void inode_wb_list_del(struct inode *inode)
  {
-       spin_lock(&inode_wb_list_lock);
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       spin_lock(&bdi->wb.list_lock);
        list_del_init(&inode->i_wb_list);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&bdi->wb.list_lock);
  }
  
  /*
   * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
   * furthest end of its superblock's dirty-inode list.
   * the case then the inode must have been redirtied while it was being written
   * out and we don't reset its dirtied_when.
   */
- static void redirty_tail(struct inode *inode)
+ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  {
-       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-       assert_spin_locked(&inode_wb_list_lock);
+       assert_spin_locked(&wb->list_lock);
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
  
  /*
   * requeue inode for re-scanning after bdi->b_io list is exhausted.
   */
- static void requeue_io(struct inode *inode)
+ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
  {
-       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-       assert_spin_locked(&inode_wb_list_lock);
+       assert_spin_locked(&wb->list_lock);
        list_move(&inode->i_wb_list, &wb->b_more_io);
  }
  
@@@ -225,7 -224,7 +224,7 @@@ static void inode_sync_complete(struct 
  {
        /*
         * Prevent speculative execution through
-        * spin_unlock(&inode_wb_list_lock);
+        * spin_unlock(&wb->list_lock);
         */
  
        smp_mb();
@@@ -250,15 -249,16 +249,16 @@@ static bool inode_dirtied_after(struct 
  /*
   * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
   */
- static void move_expired_inodes(struct list_head *delaying_queue,
+ static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
-                               unsigned long *older_than_this)
+                              unsigned long *older_than_this)
  {
        LIST_HEAD(tmp);
        struct list_head *pos, *node;
        struct super_block *sb = NULL;
        struct inode *inode;
        int do_sb_sort = 0;
+       int moved = 0;
  
        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
                        do_sb_sort = 1;
                sb = inode->i_sb;
                list_move(&inode->i_wb_list, &tmp);
+               moved++;
        }
  
        /* just one sb in list, splice to dispatch_queue and we're done */
        if (!do_sb_sort) {
                list_splice(&tmp, dispatch_queue);
-               return;
+               goto out;
        }
  
        /* Move inodes from one superblock together */
                                list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
+ out:
+       return moved;
  }
  
  /*
   */
  static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
  {
-       assert_spin_locked(&inode_wb_list_lock);
+       int moved;
+       assert_spin_locked(&wb->list_lock);
        list_splice_init(&wb->b_more_io, &wb->b_io);
-       move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+       moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+       trace_writeback_queue_io(wb, older_than_this, moved);
  }
  
  static int write_inode(struct inode *inode, struct writeback_control *wbc)
  /*
   * Wait for writeback on an inode to complete.
   */
- static void inode_wait_for_writeback(struct inode *inode)
+ static void inode_wait_for_writeback(struct inode *inode,
+                                    struct bdi_writeback *wb)
  {
        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
        wait_queue_head_t *wqh;
        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        while (inode->i_state & I_SYNC) {
                spin_unlock(&inode->i_lock);
-               spin_unlock(&inode_wb_list_lock);
+               spin_unlock(&wb->list_lock);
                __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-               spin_lock(&inode_wb_list_lock);
+               spin_lock(&wb->list_lock);
                spin_lock(&inode->i_lock);
        }
  }
  
  /*
-  * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
+  * Write out an inode's dirty pages.  Called under wb->list_lock and
   * inode->i_lock.  Either the caller has an active reference on the inode or
   * the inode has I_WILL_FREE set.
   *
   * livelocks, etc.
   */
  static int
- writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+                      struct writeback_control *wbc)
  {
        struct address_space *mapping = inode->i_mapping;
+       long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;
  
-       assert_spin_locked(&inode_wb_list_lock);
+       assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
  
        if (!atomic_read(&inode->i_count))
                 * completed a full scan of b_io.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL) {
-                       requeue_io(inode);
+                       requeue_io(inode, wb);
+                       trace_writeback_single_inode_requeue(inode, wbc,
+                                                            nr_to_write);
                        return 0;
                }
  
                /*
                 * It's a data-integrity sync.  We must wait.
                 */
-               inode_wait_for_writeback(inode);
+               inode_wait_for_writeback(inode, wb);
        }
  
        BUG_ON(inode->i_state & I_SYNC);
        inode->i_state |= I_SYNC;
        inode->i_state &= ~I_DIRTY_PAGES;
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
  
        ret = do_writepages(mapping, wbc);
  
                        ret = err;
        }
  
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
        inode->i_state &= ~I_SYNC;
        if (!(inode->i_state & I_FREEING)) {
+               /*
+                * Sync livelock prevention. Each inode is tagged and synced in
+                * one shot. If still dirty, it will be redirty_tail()'ed below.
+                * Update the dirty time to prevent enqueue and sync it again.
+                */
+               if ((inode->i_state & I_DIRTY) &&
+                   (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+                       inode->dirtied_when = jiffies;
                if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                        /*
                         * We didn't write back all the pages.  nfs_writepages()
                                /*
                                 * slice used up: queue for next turn
                                 */
-                               requeue_io(inode);
+                               requeue_io(inode, wb);
                        } else {
                                /*
                                 * Writeback blocked by something other than
                                 * retrying writeback of the dirty page/inode
                                 * that cannot be performed immediately.
                                 */
-                               redirty_tail(inode);
+                               redirty_tail(inode, wb);
                        }
                } else if (inode->i_state & I_DIRTY) {
                        /*
                         * submission or metadata updates after data IO
                         * completion.
                         */
-                       redirty_tail(inode);
+                       redirty_tail(inode, wb);
                } else {
                        /*
                         * The inode is clean.  At this point we either have
                }
        }
        inode_sync_complete(inode);
+       trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
  }
  
 -/*
 - * For background writeback the caller does not have the sb pinned
 - * before calling writeback. So make sure that we do pin it, so it doesn't
 - * go away while we are writing inodes from it.
 - */
 -static bool pin_sb_for_writeback(struct super_block *sb)
 -{
 -      spin_lock(&sb_lock);
 -      if (list_empty(&sb->s_instances)) {
 -              spin_unlock(&sb_lock);
 -              return false;
 -      }
 -
 -      sb->s_count++;
 -      spin_unlock(&sb_lock);
 -
 -      if (down_read_trylock(&sb->s_umount)) {
 -              if (sb->s_root)
 -                      return true;
 -              up_read(&sb->s_umount);
 -      }
 -
 -      put_super(sb);
 -      return false;
 -}
 -
+ static long writeback_chunk_size(struct backing_dev_info *bdi,
+                                struct wb_writeback_work *work)
+ {
+       long pages;
+       /*
+        * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+        * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+        * here avoids calling into writeback_inodes_wb() more than once.
+        *
+        * The intended call sequence for WB_SYNC_ALL writeback is:
+        *
+        *      wb_writeback()
+        *          writeback_sb_inodes()       <== called only once
+        *              write_cache_pages()     <== called once for each inode
+        *                   (quickly) tag currently dirty pages
+        *                   (maybe slowly) sync all tagged pages
+        */
+       if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+               pages = LONG_MAX;
+       else {
+               pages = min(bdi->avg_write_bandwidth / 2,
+                           global_dirty_limit / DIRTY_SCOPE);
+               pages = min(pages, work->nr_pages);
+               pages = round_down(pages + MIN_WRITEBACK_PAGES,
+                                  MIN_WRITEBACK_PAGES);
+       }
+       return pages;
+ }
  /*
   * Write a portion of b_io inodes which belong to @sb.
   *
   * inodes. Otherwise write only ones which go sequentially
   * in reverse order.
   *
-  * Return 1, if the caller writeback routine should be
-  * interrupted. Otherwise return 0.
+  * Return the number of pages and/or inodes written.
   */
- static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
-               struct writeback_control *wbc, bool only_this_sb)
+ static long writeback_sb_inodes(struct super_block *sb,
+                               struct bdi_writeback *wb,
+                               struct wb_writeback_work *work)
  {
+       struct writeback_control wbc = {
+               .sync_mode              = work->sync_mode,
+               .tagged_writepages      = work->tagged_writepages,
+               .for_kupdate            = work->for_kupdate,
+               .for_background         = work->for_background,
+               .range_cyclic           = work->range_cyclic,
+               .range_start            = 0,
+               .range_end              = LLONG_MAX,
+       };
+       unsigned long start_time = jiffies;
+       long write_chunk;
+       long wrote = 0;  /* count both pages and inodes */
        while (!list_empty(&wb->b_io)) {
-               long pages_skipped;
                struct inode *inode = wb_inode(wb->b_io.prev);
  
                if (inode->i_sb != sb) {
-                       if (only_this_sb) {
+                       if (work->sb) {
                                /*
                                 * We only want to write back data for this
                                 * superblock, move all inodes not belonging
                                 * to it back onto the dirty list.
                                 */
-                               redirty_tail(inode);
+                               redirty_tail(inode, wb);
                                continue;
                        }
  
                         * Bounce back to the caller to unpin this and
                         * pin the next superblock.
                         */
-                       return 0;
+                       break;
                }
  
                /*
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
-                       requeue_io(inode);
+                       redirty_tail(inode, wb);
                        continue;
                }
-               /*
-                * Was this inode dirtied after sync_sb_inodes was called?
-                * This keeps sync from extra jobs and livelock.
-                */
-               if (inode_dirtied_after(inode, wbc->wb_start)) {
-                       spin_unlock(&inode->i_lock);
-                       return 1;
-               }
                __iget(inode);
+               write_chunk = writeback_chunk_size(wb->bdi, work);
+               wbc.nr_to_write = write_chunk;
+               wbc.pages_skipped = 0;
  
-               pages_skipped = wbc->pages_skipped;
-               writeback_single_inode(inode, wbc);
-               if (wbc->pages_skipped != pages_skipped) {
+               writeback_single_inode(inode, wb, &wbc);
+               work->nr_pages -= write_chunk - wbc.nr_to_write;
+               wrote += write_chunk - wbc.nr_to_write;
+               if (!(inode->i_state & I_DIRTY))
+                       wrote++;
+               if (wbc.pages_skipped) {
                        /*
                         * writeback is not making progress due to locked
                         * buffers.  Skip this inode for now.
                         */
-                       redirty_tail(inode);
+                       redirty_tail(inode, wb);
                }
                spin_unlock(&inode->i_lock);
-               spin_unlock(&inode_wb_list_lock);
+               spin_unlock(&wb->list_lock);
                iput(inode);
                cond_resched();
-               spin_lock(&inode_wb_list_lock);
-               if (wbc->nr_to_write <= 0) {
-                       wbc->more_io = 1;
-                       return 1;
+               spin_lock(&wb->list_lock);
+               /*
+                * bail out to wb_writeback() often enough to check
+                * background threshold and other termination conditions.
+                */
+               if (wrote) {
+                       if (time_is_before_jiffies(start_time + HZ / 10UL))
+                               break;
+                       if (work->nr_pages <= 0)
+                               break;
                }
-               if (!list_empty(&wb->b_more_io))
-                       wbc->more_io = 1;
        }
-       /* b_io is empty */
-       return 1;
+       return wrote;
  }
  
- void writeback_inodes_wb(struct bdi_writeback *wb,
-               struct writeback_control *wbc)
+ static long __writeback_inodes_wb(struct bdi_writeback *wb,
+                                 struct wb_writeback_work *work)
  {
-       int ret = 0;
-       if (!wbc->wb_start)
-               wbc->wb_start = jiffies; /* livelock avoidance */
-       spin_lock(&inode_wb_list_lock);
-       if (!wbc->for_kupdate || list_empty(&wb->b_io))
-               queue_io(wb, wbc->older_than_this);
+       unsigned long start_time = jiffies;
+       long wrote = 0;
  
        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;
  
 -              if (!pin_sb_for_writeback(sb)) {
 +              if (!grab_super_passive(sb)) {
-                       requeue_io(inode);
+                       requeue_io(inode, wb);
                        continue;
                }
-               ret = writeback_sb_inodes(sb, wb, wbc, false);
+               wrote += writeback_sb_inodes(sb, wb, work);
                drop_super(sb);
  
-               if (ret)
-                       break;
+               /* refer to the same tests at the end of writeback_sb_inodes */
+               if (wrote) {
+                       if (time_is_before_jiffies(start_time + HZ / 10UL))
+                               break;
+                       if (work->nr_pages <= 0)
+                               break;
+               }
        }
-       spin_unlock(&inode_wb_list_lock);
        /* Leave any unwritten inodes on b_io */
+       return wrote;
  }
  
- static void __writeback_inodes_sb(struct super_block *sb,
-               struct bdi_writeback *wb, struct writeback_control *wbc)
+ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
  {
-       WARN_ON(!rwsem_is_locked(&sb->s_umount));
+       struct wb_writeback_work work = {
+               .nr_pages       = nr_pages,
+               .sync_mode      = WB_SYNC_NONE,
+               .range_cyclic   = 1,
+       };
  
-       spin_lock(&inode_wb_list_lock);
-       if (!wbc->for_kupdate || list_empty(&wb->b_io))
-               queue_io(wb, wbc->older_than_this);
-       writeback_sb_inodes(sb, wb, wbc, true);
-       spin_unlock(&inode_wb_list_lock);
- }
+       spin_lock(&wb->list_lock);
+       if (list_empty(&wb->b_io))
+               queue_io(wb, NULL);
+       __writeback_inodes_wb(wb, &work);
+       spin_unlock(&wb->list_lock);
  
- /*
-  * The maximum number of pages to writeout in a single bdi flush/kupdate
-  * operation.  We do this so we don't hold I_SYNC against an inode for
-  * enormous amounts of time, which would block a userspace task which has
-  * been forced to throttle against that inode.  Also, the code reevaluates
-  * the dirty each time it has written this many pages.
-  */
- #define MAX_WRITEBACK_PAGES     1024
+       return nr_pages - work.nr_pages;
+ }
  
  static inline bool over_bground_thresh(void)
  {
                global_page_state(NR_UNSTABLE_NFS) > background_thresh);
  }
  
+ /*
+  * Called under wb->list_lock. If there are multiple wb per bdi,
+  * only the flusher working on the first wb should do it.
+  */
+ static void wb_update_bandwidth(struct bdi_writeback *wb,
+                               unsigned long start_time)
+ {
+       __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
+ }
  /*
   * Explicit flushing or periodic writeback of "old" data.
   *
  static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
  {
-       struct writeback_control wbc = {
-               .sync_mode              = work->sync_mode,
-               .older_than_this        = NULL,
-               .for_kupdate            = work->for_kupdate,
-               .for_background         = work->for_background,
-               .range_cyclic           = work->range_cyclic,
-       };
+       unsigned long wb_start = jiffies;
+       long nr_pages = work->nr_pages;
        unsigned long oldest_jif;
-       long wrote = 0;
-       long write_chunk;
        struct inode *inode;
+       long progress;
  
-       if (wbc.for_kupdate) {
-               wbc.older_than_this = &oldest_jif;
-               oldest_jif = jiffies -
-                               msecs_to_jiffies(dirty_expire_interval * 10);
-       }
-       if (!wbc.range_cyclic) {
-               wbc.range_start = 0;
-               wbc.range_end = LLONG_MAX;
-       }
+       oldest_jif = jiffies;
+       work->older_than_this = &oldest_jif;
  
-       /*
-        * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
-        * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
-        * here avoids calling into writeback_inodes_wb() more than once.
-        *
-        * The intended call sequence for WB_SYNC_ALL writeback is:
-        *
-        *      wb_writeback()
-        *          __writeback_inodes_sb()     <== called only once
-        *              write_cache_pages()     <== called once for each inode
-        *                   (quickly) tag currently dirty pages
-        *                   (maybe slowly) sync all tagged pages
-        */
-       if (wbc.sync_mode == WB_SYNC_NONE)
-               write_chunk = MAX_WRITEBACK_PAGES;
-       else
-               write_chunk = LONG_MAX;
-       wbc.wb_start = jiffies; /* livelock avoidance */
+       spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                if (work->for_background && !over_bground_thresh())
                        break;
  
-               wbc.more_io = 0;
-               wbc.nr_to_write = write_chunk;
-               wbc.pages_skipped = 0;
+               if (work->for_kupdate) {
+                       oldest_jif = jiffies -
+                               msecs_to_jiffies(dirty_expire_interval * 10);
+                       work->older_than_this = &oldest_jif;
+               }
  
-               trace_wbc_writeback_start(&wbc, wb->bdi);
+               trace_writeback_start(wb->bdi, work);
+               if (list_empty(&wb->b_io))
+                       queue_io(wb, work->older_than_this);
                if (work->sb)
-                       __writeback_inodes_sb(work->sb, wb, &wbc);
+                       progress = writeback_sb_inodes(work->sb, wb, work);
                else
-                       writeback_inodes_wb(wb, &wbc);
-               trace_wbc_writeback_written(&wbc, wb->bdi);
+                       progress = __writeback_inodes_wb(wb, work);
+               trace_writeback_written(wb->bdi, work);
  
-               work->nr_pages -= write_chunk - wbc.nr_to_write;
-               wrote += write_chunk - wbc.nr_to_write;
+               wb_update_bandwidth(wb, wb_start);
  
                /*
-                * If we consumed everything, see if we have more
+                * Did we write something? Try for more
+                *
+                * Dirty inodes are moved to b_io for writeback in batches.
+                * The completion of the current batch does not necessarily
+                * mean the overall work is done. So we keep looping as long
+                * as made some progress on cleaning pages or inodes.
                 */
-               if (wbc.nr_to_write <= 0)
+               if (progress)
                        continue;
                /*
-                * Didn't write everything and we don't have more IO, bail
+                * No more inodes for IO, bail
                 */
-               if (!wbc.more_io)
+               if (list_empty(&wb->b_more_io))
                        break;
-               /*
-                * Did we write something? Try for more
-                */
-               if (wbc.nr_to_write < write_chunk)
-                       continue;
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
-               spin_lock(&inode_wb_list_lock);
                if (!list_empty(&wb->b_more_io))  {
+                       trace_writeback_wait(wb->bdi, work);
                        inode = wb_inode(wb->b_more_io.prev);
-                       trace_wbc_writeback_wait(&wbc, wb->bdi);
                        spin_lock(&inode->i_lock);
-                       inode_wait_for_writeback(inode);
+                       inode_wait_for_writeback(inode, wb);
                        spin_unlock(&inode->i_lock);
                }
-               spin_unlock(&inode_wb_list_lock);
        }
+       spin_unlock(&wb->list_lock);
  
-       return wrote;
+       return nr_pages - work->nr_pages;
  }
  
  /*
@@@ -1063,10 -1129,10 +1103,10 @@@ void __mark_inode_dirty(struct inode *i
                        }
  
                        spin_unlock(&inode->i_lock);
-                       spin_lock(&inode_wb_list_lock);
+                       spin_lock(&bdi->wb.list_lock);
                        inode->dirtied_when = jiffies;
                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
-                       spin_unlock(&inode_wb_list_lock);
+                       spin_unlock(&bdi->wb.list_lock);
  
                        if (wakeup_bdi)
                                bdi_wakeup_thread_delayed(bdi);
@@@ -1162,10 -1228,11 +1202,11 @@@ void writeback_inodes_sb_nr(struct supe
  {
        DECLARE_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
-               .sb             = sb,
-               .sync_mode      = WB_SYNC_NONE,
-               .done           = &done,
-               .nr_pages       = nr,
+               .sb                     = sb,
+               .sync_mode              = WB_SYNC_NONE,
+               .tagged_writepages      = 1,
+               .done                   = &done,
+               .nr_pages               = nr,
        };
  
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@@ -1267,6 -1334,7 +1308,7 @@@ EXPORT_SYMBOL(sync_inodes_sb)
   */
  int write_inode_now(struct inode *inode, int sync)
  {
+       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
        int ret;
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                wbc.nr_to_write = 0;
  
        might_sleep();
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
-       ret = writeback_single_inode(inode, &wbc);
+       ret = writeback_single_inode(inode, wb, &wbc);
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
        if (sync)
                inode_sync_wait(inode);
        return ret;
@@@ -1303,13 -1371,14 +1345,14 @@@ EXPORT_SYMBOL(write_inode_now)
   */
  int sync_inode(struct inode *inode, struct writeback_control *wbc)
  {
+       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
        int ret;
  
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
-       ret = writeback_single_inode(inode, wbc);
+       ret = writeback_single_inode(inode, wb, wbc);
        spin_unlock(&inode->i_lock);
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
        return ret;
  }
  EXPORT_SYMBOL(sync_inode);
diff --combined fs/inode.c
   *
   * inode->i_lock protects:
   *   inode->i_state, inode->i_hash, __iget()
 - * inode_lru_lock protects:
 - *   inode_lru, inode->i_lru
 + * inode->i_sb->s_inode_lru_lock protects:
 + *   inode->i_sb->s_inode_lru, inode->i_lru
   * inode_sb_list_lock protects:
   *   sb->s_inodes, inode->i_sb_list
-  * inode_wb_list_lock protects:
+  * bdi->wb.list_lock protects:
   *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
   * inode_hash_lock protects:
   *   inode_hashtable, inode->i_hash
@@@ -46,9 -46,9 +46,9 @@@
   *
   * inode_sb_list_lock
   *   inode->i_lock
 - *     inode_lru_lock
 + *     inode->i_sb->s_inode_lru_lock
   *
-  * inode_wb_list_lock
+  * bdi->wb.list_lock
   *   inode->i_lock
   *
   * inode_hash_lock
@@@ -64,9 -64,22 +64,8 @@@ static unsigned int i_hash_shift __read
  static struct hlist_head *inode_hashtable __read_mostly;
  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
  
 -static LIST_HEAD(inode_lru);
 -static DEFINE_SPINLOCK(inode_lru_lock);
 -
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
  
 -/*
 - * iprune_sem provides exclusion between the icache shrinking and the
 - * umount path.
 - *
 - * We don't actually need it to protect anything in the umount path,
 - * but only need to cycle through it to make sure any inode that
 - * prune_icache took off the LRU list has been fully torn down by the
 - * time we are past evict_inodes.
 - */
 -static DECLARE_RWSEM(iprune_sem);
 -
  /*
   * Empty aops. Can be used for the cases where the user does not
   * define any of the address_space operations.
@@@ -81,7 -94,6 +80,7 @@@ EXPORT_SYMBOL(empty_aops)
  struct inodes_stat_t inodes_stat;
  
  static DEFINE_PER_CPU(unsigned int, nr_inodes);
 +static DEFINE_PER_CPU(unsigned int, nr_unused);
  
  static struct kmem_cache *inode_cachep __read_mostly;
  
@@@ -96,11 -108,7 +95,11 @@@ static int get_nr_inodes(void
  
  static inline int get_nr_inodes_unused(void)
  {
 -      return inodes_stat.nr_unused;
 +      int i;
 +      int sum = 0;
 +      for_each_possible_cpu(i)
 +              sum += per_cpu(nr_unused, i);
 +      return sum < 0 ? 0 : sum;
  }
  
  int get_nr_dirty_inodes(void)
@@@ -118,7 -126,6 +117,7 @@@ int proc_nr_inodes(ctl_table *table, in
                   void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        inodes_stat.nr_inodes = get_nr_inodes();
 +      inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_dointvec(table, write, buffer, lenp, ppos);
  }
  #endif
@@@ -168,7 -175,8 +167,7 @@@ int inode_init_always(struct super_bloc
        mutex_init(&inode->i_mutex);
        lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
  
 -      init_rwsem(&inode->i_alloc_sem);
 -      lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
 +      atomic_set(&inode->i_dio_count, 0);
  
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
@@@ -328,24 -336,22 +327,24 @@@ EXPORT_SYMBOL(ihold)
  
  static void inode_lru_list_add(struct inode *inode)
  {
 -      spin_lock(&inode_lru_lock);
 +      spin_lock(&inode->i_sb->s_inode_lru_lock);
        if (list_empty(&inode->i_lru)) {
 -              list_add(&inode->i_lru, &inode_lru);
 -              inodes_stat.nr_unused++;
 +              list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
 +              inode->i_sb->s_nr_inodes_unused++;
 +              this_cpu_inc(nr_unused);
        }
 -      spin_unlock(&inode_lru_lock);
 +      spin_unlock(&inode->i_sb->s_inode_lru_lock);
  }
  
  static void inode_lru_list_del(struct inode *inode)
  {
 -      spin_lock(&inode_lru_lock);
 +      spin_lock(&inode->i_sb->s_inode_lru_lock);
        if (!list_empty(&inode->i_lru)) {
                list_del_init(&inode->i_lru);
 -              inodes_stat.nr_unused--;
 +              inode->i_sb->s_nr_inodes_unused--;
 +              this_cpu_dec(nr_unused);
        }
 -      spin_unlock(&inode_lru_lock);
 +      spin_unlock(&inode->i_sb->s_inode_lru_lock);
  }
  
  /**
@@@ -416,14 -422,7 +415,14 @@@ EXPORT_SYMBOL(remove_inode_hash)
  void end_writeback(struct inode *inode)
  {
        might_sleep();
 +      /*
 +       * We have to cycle tree_lock here because reclaim can be still in the
 +       * process of removing the last page (in __delete_from_page_cache())
 +       * and we must not free mapping under it.
 +       */
 +      spin_lock_irq(&inode->i_data.tree_lock);
        BUG_ON(inode->i_data.nrpages);
 +      spin_unlock_irq(&inode->i_data.tree_lock);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
@@@ -530,6 -529,14 +529,6 @@@ void evict_inodes(struct super_block *s
        spin_unlock(&inode_sb_list_lock);
  
        dispose_list(&dispose);
 -
 -      /*
 -       * Cycle through iprune_sem to make sure any inode that prune_icache
 -       * moved off the list before we took the lock has been fully torn
 -       * down.
 -       */
 -      down_write(&iprune_sem);
 -      up_write(&iprune_sem);
  }
  
  /**
@@@ -592,10 -599,8 +591,10 @@@ static int can_unuse(struct inode *inod
  }
  
  /*
 - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
 - * temporary list and then are freed outside inode_lru_lock by dispose_list().
 + * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 + * This is called from the superblock shrinker function with a number of inodes
 + * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 + * then are freed outside inode_lock by dispose_list().
   *
   * Any inodes which are pinned purely because of attached pagecache have their
   * pagecache removed.  If the inode has metadata buffers attached to
   * LRU does not have strict ordering. Hence we don't want to reclaim inodes
   * with this flag set because they are the inodes that are out of order.
   */
 -static void prune_icache(int nr_to_scan)
 +void prune_icache_sb(struct super_block *sb, int nr_to_scan)
  {
        LIST_HEAD(freeable);
        int nr_scanned;
        unsigned long reap = 0;
  
 -      down_read(&iprune_sem);
 -      spin_lock(&inode_lru_lock);
 -      for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 +      spin_lock(&sb->s_inode_lru_lock);
 +      for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
                struct inode *inode;
  
 -              if (list_empty(&inode_lru))
 +              if (list_empty(&sb->s_inode_lru))
                        break;
  
 -              inode = list_entry(inode_lru.prev, struct inode, i_lru);
 +              inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
  
                /*
 -               * we are inverting the inode_lru_lock/inode->i_lock here,
 +               * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
                 * so use a trylock. If we fail to get the lock, just move the
                 * inode to the back of the list so we don't spin on it.
                 */
                if (!spin_trylock(&inode->i_lock)) {
 -                      list_move(&inode->i_lru, &inode_lru);
 +                      list_move(&inode->i_lru, &sb->s_inode_lru);
                        continue;
                }
  
                    (inode->i_state & ~I_REFERENCED)) {
                        list_del_init(&inode->i_lru);
                        spin_unlock(&inode->i_lock);
 -                      inodes_stat.nr_unused--;
 +                      sb->s_nr_inodes_unused--;
 +                      this_cpu_dec(nr_unused);
                        continue;
                }
  
                /* recently referenced inodes get one more pass */
                if (inode->i_state & I_REFERENCED) {
                        inode->i_state &= ~I_REFERENCED;
 -                      list_move(&inode->i_lru, &inode_lru);
 +                      list_move(&inode->i_lru, &sb->s_inode_lru);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                        __iget(inode);
                        spin_unlock(&inode->i_lock);
 -                      spin_unlock(&inode_lru_lock);
 +                      spin_unlock(&sb->s_inode_lru_lock);
                        if (remove_inode_buffers(inode))
                                reap += invalidate_mapping_pages(&inode->i_data,
                                                                0, -1);
                        iput(inode);
 -                      spin_lock(&inode_lru_lock);
 +                      spin_lock(&sb->s_inode_lru_lock);
  
 -                      if (inode != list_entry(inode_lru.next,
 +                      if (inode != list_entry(sb->s_inode_lru.next,
                                                struct inode, i_lru))
                                continue;       /* wrong inode or list_empty */
                        /* avoid lock inversions with trylock */
                spin_unlock(&inode->i_lock);
  
                list_move(&inode->i_lru, &freeable);
 -              inodes_stat.nr_unused--;
 +              sb->s_nr_inodes_unused--;
 +              this_cpu_dec(nr_unused);
        }
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_INODESTEAL, reap);
        else
                __count_vm_events(PGINODESTEAL, reap);
 -      spin_unlock(&inode_lru_lock);
 +      spin_unlock(&sb->s_inode_lru_lock);
  
        dispose_list(&freeable);
 -      up_read(&iprune_sem);
  }
  
 -/*
 - * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
 - * "unused" means that no dentries are referring to the inodes: the files are
 - * not open and the dcache references to those inodes have already been
 - * reclaimed.
 - *
 - * This function is passed the number of inodes to scan, and it returns the
 - * total number of remaining possibly-reclaimable inodes.
 - */
 -static int shrink_icache_memory(struct shrinker *shrink,
 -                              struct shrink_control *sc)
 -{
 -      int nr = sc->nr_to_scan;
 -      gfp_t gfp_mask = sc->gfp_mask;
 -
 -      if (nr) {
 -              /*
 -               * Nasty deadlock avoidance.  We may hold various FS locks,
 -               * and we don't want to recurse into the FS that called us
 -               * in clear_inode() and friends..
 -               */
 -              if (!(gfp_mask & __GFP_FS))
 -                      return -1;
 -              prune_icache(nr);
 -      }
 -      return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 -}
 -
 -static struct shrinker icache_shrinker = {
 -      .shrink = shrink_icache_memory,
 -      .seeks = DEFAULT_SEEKS,
 -};
 -
  static void __wait_on_freeing_inode(struct inode *inode);
  /*
   * Called with the inode lock held.
@@@ -1285,7 -1323,7 +1284,7 @@@ static void iput_final(struct inode *in
  
        WARN_ON(inode->i_state & I_NEW);
  
 -      if (op && op->drop_inode)
 +      if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);
@@@ -1571,6 -1609,7 +1570,6 @@@ void __init inode_init(void
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_MEM_SPREAD),
                                         init_once);
 -      register_shrinker(&icache_shrinker);
  
        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
diff --combined fs/nfs/write.c
@@@ -409,7 -409,7 +409,7 @@@ out
   */
  static void nfs_inode_remove_request(struct nfs_page *req)
  {
 -      struct inode *inode = req->wb_context->path.dentry->d_inode;
 +      struct inode *inode = req->wb_context->dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
  
        BUG_ON (!NFS_WBACK_BUSY(req));
@@@ -438,7 -438,7 +438,7 @@@ nfs_mark_request_dirty(struct nfs_page 
  static void
  nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
  {
 -      struct inode *inode = req->wb_context->path.dentry->d_inode;
 +      struct inode *inode = req->wb_context->dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
  
        spin_lock(&inode->i_lock);
@@@ -852,20 -852,18 +852,20 @@@ static int nfs_write_rpcsetup(struct nf
                struct pnfs_layout_segment *lseg,
                int how)
  {
 -      struct inode *inode = req->wb_context->path.dentry->d_inode;
 +      struct inode *inode = req->wb_context->dentry->d_inode;
  
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
  
        data->req = req;
 -      data->inode = inode = req->wb_context->path.dentry->d_inode;
 +      data->inode = inode = req->wb_context->dentry->d_inode;
        data->cred = req->wb_context->cred;
        data->lseg = get_lseg(lseg);
  
        data->args.fh     = NFS_FH(inode);
        data->args.offset = req_offset(req) + offset;
 +      /* pnfs_set_layoutcommit needs this */
 +      data->mds_offset = data->args.offset;
        data->args.pgbase = req->wb_pgbase + offset;
        data->args.pages  = data->pagevec;
        data->args.count  = count;
@@@ -1053,9 -1051,9 +1053,9 @@@ static void nfs_writeback_done_partial(
  
        dprintk("NFS: %5u write(%s/%lld %d@%lld)",
                task->tk_pid,
 -              data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
 +              data->req->wb_context->dentry->d_inode->i_sb->s_id,
                (long long)
 -                NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
 +                NFS_FILEID(data->req->wb_context->dentry->d_inode),
                data->req->wb_bytes, (long long)req_offset(data->req));
  
        nfs_writeback_done(task, data);
@@@ -1148,8 -1146,8 +1148,8 @@@ static void nfs_writeback_release_full(
  
                dprintk("NFS: %5u write (%s/%lld %d@%lld)",
                        data->task.tk_pid,
 -                      req->wb_context->path.dentry->d_inode->i_sb->s_id,
 -                      (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 +                      req->wb_context->dentry->d_inode->i_sb->s_id,
 +                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
                        req->wb_bytes,
                        (long long)req_offset(req));
  
@@@ -1347,7 -1345,7 +1347,7 @@@ void nfs_init_commit(struct nfs_write_d
                            struct pnfs_layout_segment *lseg)
  {
        struct nfs_page *first = nfs_list_entry(head->next);
 -      struct inode *inode = first->wb_context->path.dentry->d_inode;
 +      struct inode *inode = first->wb_context->dentry->d_inode;
  
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
@@@ -1435,8 -1433,8 +1435,8 @@@ void nfs_commit_release_pages(struct nf
                nfs_clear_request_commit(req);
  
                dprintk("NFS:       commit (%s/%lld %d@%lld)",
 -                      req->wb_context->path.dentry->d_inode->i_sb->s_id,
 -                      (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 +                      req->wb_context->dentry->d_sb->s_id,
 +                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
                        req->wb_bytes,
                        (long long)req_offset(req));
                if (status < 0) {
@@@ -1566,8 -1564,7 +1566,7 @@@ int nfs_write_inode(struct inode *inode
                int status;
                bool sync = true;
  
-               if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
-                   wbc->for_background)
+               if (wbc->sync_mode == WB_SYNC_NONE)
                        sync = false;
  
                status = pnfs_layoutcommit_inode(inode, sync);
@@@ -26,7 -26,7 +26,7 @@@ TRACE_EVENT(ext4_free_inode
                __field(        umode_t, mode                   )
                __field(        uid_t,  uid                     )
                __field(        gid_t,  gid                     )
 -              __field(        blkcnt_t, blocks                )
 +              __field(        __u64, blocks                   )
        ),
  
        TP_fast_assign(
@@@ -40,8 -40,9 +40,8 @@@
  
        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                __entry->mode, __entry->uid, __entry->gid,
 -                (unsigned long long) __entry->blocks)
 +                (unsigned long) __entry->ino, __entry->mode,
 +                __entry->uid, __entry->gid, __entry->blocks)
  );
  
  TRACE_EVENT(ext4_request_inode,
@@@ -177,7 -178,7 +177,7 @@@ TRACE_EVENT(ext4_begin_ordered_truncate
        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (long long) __entry->new_size)
 +                __entry->new_size)
  );
  
  DECLARE_EVENT_CLASS(ext4__write_begin,
                __entry->flags  = flags;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->flags)
@@@ -247,7 -248,7 +247,7 @@@ DECLARE_EVENT_CLASS(ext4__write_end
                __entry->copied = copied;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->copied)
@@@ -285,6 -286,29 +285,6 @@@ DEFINE_EVENT(ext4__write_end, ext4_da_w
        TP_ARGS(inode, pos, len, copied)
  );
  
 -TRACE_EVENT(ext4_writepage,
 -      TP_PROTO(struct inode *inode, struct page *page),
 -
 -      TP_ARGS(inode, page),
 -
 -      TP_STRUCT__entry(
 -              __field(        dev_t,  dev                     )
 -              __field(        ino_t,  ino                     )
 -              __field(        pgoff_t, index                  )
 -
 -      ),
 -
 -      TP_fast_assign(
 -              __entry->dev    = inode->i_sb->s_dev;
 -              __entry->ino    = inode->i_ino;
 -              __entry->index  = page->index;
 -      ),
 -
 -      TP_printk("dev %d,%d ino %lu page_index %lu",
 -                MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino, __entry->index)
 -);
 -
  TRACE_EVENT(ext4_da_writepages,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),
  
        ),
  
        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
 -                "range_start %llu range_end %llu sync_mode %d"
 +                "range_start %lld range_end %lld sync_mode %d"
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nr_to_write,
@@@ -380,7 -404,6 +380,6 @@@ TRACE_EVENT(ext4_da_writepages_result
                __field(        int,    pages_written           )
                __field(        long,   pages_skipped           )
                __field(        int,    sync_mode               )
-               __field(        char,   more_io                 )       
                __field(       pgoff_t, writeback_index         )
        ),
  
                __entry->pages_written  = pages_written;
                __entry->pages_skipped  = wbc->pages_skipped;
                __entry->sync_mode      = wbc->sync_mode;
-               __entry->more_io        = wbc->more_io;
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
  
        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
-                 " more_io %d sync_mode %d writeback_index %lu",
+                 "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
-                 __entry->more_io, __entry->sync_mode,
+                 __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
  );
  
@@@ -425,14 -447,7 +423,14 @@@ DECLARE_EVENT_CLASS(ext4__page_op
        TP_printk("dev %d,%d ino %lu page_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->index)
 +                (unsigned long) __entry->index)
 +);
 +
 +DEFINE_EVENT(ext4__page_op, ext4_writepage,
 +
 +      TP_PROTO(struct page *page),
 +
 +      TP_ARGS(page)
  );
  
  DEFINE_EVENT(ext4__page_op, ext4_readpage,
@@@ -472,7 -487,7 +470,7 @@@ TRACE_EVENT(ext4_invalidatepage
        TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->index, __entry->offset)
 +                (unsigned long) __entry->index, __entry->offset)
  );
  
  TRACE_EVENT(ext4_discard_blocks,
@@@ -545,10 -560,12 +543,10 @@@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_n
  );
  
  TRACE_EVENT(ext4_mb_release_inode_pa,
 -      TP_PROTO(struct super_block *sb,
 -               struct inode *inode,
 -               struct ext4_prealloc_space *pa,
 +      TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),
  
 -      TP_ARGS(sb, inode, pa, block, count),
 +      TP_ARGS(pa, block, count),
  
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
        ),
  
        TP_fast_assign(
 -              __entry->dev            = sb->s_dev;
 -              __entry->ino            = inode->i_ino;
 +              __entry->dev            = pa->pa_inode->i_sb->s_dev;
 +              __entry->ino            = pa->pa_inode->i_ino;
                __entry->block          = block;
                __entry->count          = count;
        ),
  );
  
  TRACE_EVENT(ext4_mb_release_group_pa,
 -      TP_PROTO(struct super_block *sb,
 -               struct ext4_prealloc_space *pa),
 +      TP_PROTO(struct ext4_prealloc_space *pa),
  
 -      TP_ARGS(sb, pa),
 +      TP_ARGS(pa),
  
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
        ),
  
        TP_fast_assign(
 -              __entry->dev            = sb->s_dev;
 +              __entry->dev            = pa->pa_inode->i_sb->s_dev;
                __entry->pa_pstart      = pa->pa_pstart;
                __entry->pa_len         = pa->pa_len;
        ),
@@@ -646,10 -664,10 +644,10 @@@ TRACE_EVENT(ext4_request_blocks
                __field(        ino_t,  ino                     )
                __field(        unsigned int, flags             )
                __field(        unsigned int, len               )
 -              __field(        __u64,  logical                 )
 +              __field(        __u32,  logical                 )
 +              __field(        __u32,  lleft                   )
 +              __field(        __u32,  lright                  )
                __field(        __u64,  goal                    )
 -              __field(        __u64,  lleft                   )
 -              __field(        __u64,  lright                  )
                __field(        __u64,  pleft                   )
                __field(        __u64,  pright                  )
        ),
                __entry->pright = ar->pright;
        ),
  
 -      TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu "
 -                "lleft %llu lright %llu pleft %llu pright %llu ",
 +      TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
 +                "lleft %u lright %u pleft %llu pright %llu ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                __entry->flags, __entry->len,
 -                (unsigned long long) __entry->logical,
 -                (unsigned long long) __entry->goal,
 -                (unsigned long long) __entry->lleft,
 -                (unsigned long long) __entry->lright,
 -                (unsigned long long) __entry->pleft,
 -                (unsigned long long) __entry->pright)
 +                (unsigned long) __entry->ino, __entry->flags,
 +                __entry->len, __entry->logical, __entry->goal,
 +                __entry->lleft, __entry->lright, __entry->pleft,
 +                __entry->pright)
  );
  
  TRACE_EVENT(ext4_allocate_blocks,
                __field(        __u64,  block                   )
                __field(        unsigned int, flags             )
                __field(        unsigned int, len               )
 -              __field(        __u64,  logical                 )
 +              __field(        __u32,  logical                 )
 +              __field(        __u32,  lleft                   )
 +              __field(        __u32,  lright                  )
                __field(        __u64,  goal                    )
 -              __field(        __u64,  lleft                   )
 -              __field(        __u64,  lright                  )
                __field(        __u64,  pleft                   )
                __field(        __u64,  pright                  )
        ),
                __entry->pright = ar->pright;
        ),
  
 -      TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu "
 -                "goal %llu lleft %llu lright %llu pleft %llu pright %llu",
 +      TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
 +                "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                __entry->flags, __entry->len, __entry->block,
 -                (unsigned long long) __entry->logical,
 -                (unsigned long long) __entry->goal,
 -                (unsigned long long) __entry->lleft,
 -                (unsigned long long) __entry->lright,
 -                (unsigned long long) __entry->pleft,
 -                (unsigned long long) __entry->pright)
 +                (unsigned long) __entry->ino, __entry->flags,
 +                __entry->len, __entry->block, __entry->logical,
 +                __entry->goal,  __entry->lleft, __entry->lright,
 +                __entry->pleft, __entry->pright)
  );
  
  TRACE_EVENT(ext4_free_blocks,
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(        ino_t,  ino                     )
 -              __field(      umode_t, mode                     )
 +              __field(        umode_t, mode                   )
                __field(        __u64,  block                   )
                __field(        unsigned long,  count           )
 -              __field(         int,   flags                   )
 +              __field(        int,    flags                   )
        ),
  
        TP_fast_assign(
@@@ -770,7 -796,7 +768,7 @@@ TRACE_EVENT(ext4_sync_file_enter
                __entry->parent         = dentry->d_parent->d_inode->i_ino;
        ),
  
 -      TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
 +      TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
@@@ -793,7 -819,7 +791,7 @@@ TRACE_EVENT(ext4_sync_file_exit
                __entry->dev            = inode->i_sb->s_dev;
        ),
  
 -      TP_printk("dev %d,%d ino %ld ret %d",
 +      TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
@@@ -977,7 -1003,7 +975,7 @@@ DECLARE_EVENT_CLASS(ext4__mballoc
                __entry->result_len     = len;
        ),
  
 -      TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
 +      TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->result_group, __entry->result_start,
@@@ -1065,7 -1091,7 +1063,7 @@@ TRACE_EVENT(ext4_da_update_reserve_spac
                  "allocated_meta_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->mode,  (unsigned long long) __entry->i_blocks,
 +                __entry->mode, __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -1099,7 -1125,7 +1097,7 @@@ TRACE_EVENT(ext4_da_reserve_space
                  "reserved_data_blocks %d reserved_meta_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->mode, (unsigned long long) __entry->i_blocks,
 +                __entry->mode, __entry->i_blocks,
                  __entry->md_needed, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks)
  );
@@@ -1136,7 -1162,7 +1134,7 @@@ TRACE_EVENT(ext4_da_release_space
                  "allocated_meta_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                __entry->mode, (unsigned long long) __entry->i_blocks,
 +                __entry->mode, __entry->i_blocks,
                  __entry->freed_blocks, __entry->reserved_data_blocks,
                  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -1211,15 -1237,14 +1209,15 @@@ TRACE_EVENT(ext4_direct_IO_enter
                __entry->rw     = rw;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos, __entry->len, __entry->rw)
 +                __entry->pos, __entry->len, __entry->rw)
  );
  
  TRACE_EVENT(ext4_direct_IO_exit,
 -      TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret),
 +      TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
 +               int rw, int ret),
  
        TP_ARGS(inode, offset, len, rw, ret),
  
                __entry->ret    = ret;
        ),
  
 -      TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos, __entry->len,
 +                __entry->pos, __entry->len,
                  __entry->rw, __entry->ret)
  );
  
@@@ -1269,15 -1294,15 +1267,15 @@@ TRACE_EVENT(ext4_fallocate_enter
                __entry->mode   = mode;
        ),
  
 -      TP_printk("dev %d,%d ino %ld pos %llu len %llu mode %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos,
 -                (unsigned long long) __entry->len, __entry->mode)
 +                (unsigned long) __entry->ino, __entry->pos,
 +                __entry->len, __entry->mode)
  );
  
  TRACE_EVENT(ext4_fallocate_exit,
 -      TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret),
 +      TP_PROTO(struct inode *inode, loff_t offset,
 +               unsigned int max_blocks, int ret),
  
        TP_ARGS(inode, offset, max_blocks, ret),
  
                __field(        ino_t,  ino                     )
                __field(        dev_t,  dev                     )
                __field(        loff_t, pos                     )
 -              __field(        unsigned,       blocks          )
 +              __field(        unsigned int,   blocks          )
                __field(        int,    ret                     )
        ),
  
                __entry->ret    = ret;
        ),
  
 -      TP_printk("dev %d,%d ino %ld pos %llu blocks %d ret %d",
 +      TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned long long) __entry->pos, __entry->blocks,
 +                __entry->pos, __entry->blocks,
                  __entry->ret)
  );
  
@@@ -1323,7 -1348,7 +1321,7 @@@ TRACE_EVENT(ext4_unlink_enter
                __entry->dev            = dentry->d_inode->i_sb->s_dev;
        ),
  
 -      TP_printk("dev %d,%d ino %ld size %lld parent %ld",
 +      TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->size,
                  (unsigned long) __entry->parent)
@@@ -1346,7 -1371,7 +1344,7 @@@ TRACE_EVENT(ext4_unlink_exit
                __entry->ret            = ret;
        ),
  
 -      TP_printk("dev %d,%d ino %ld ret %d",
 +      TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
@@@ -1360,7 -1385,7 +1358,7 @@@ DECLARE_EVENT_CLASS(ext4__truncate
        TP_STRUCT__entry(
                __field(        ino_t,          ino             )
                __field(        dev_t,          dev             )
 -              __field(        blkcnt_t,       blocks          )
 +              __field(        __u64,          blocks          )
        ),
  
        TP_fast_assign(
                __entry->blocks = inode->i_blocks;
        ),
  
 -      TP_printk("dev %d,%d ino %lu blocks %lu",
 +      TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
 -                (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
 +                (unsigned long) __entry->ino, __entry->blocks)
  );
  
  DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
@@@ -1390,7 -1415,7 +1388,7 @@@ DEFINE_EVENT(ext4__truncate, ext4_trunc
  
  DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 -               unsigned len, unsigned flags),
 +               unsigned int len, unsigned int flags),
  
        TP_ARGS(inode, lblk, len, flags),
  
                __field(        ino_t,          ino             )
                __field(        dev_t,          dev             )
                __field(        ext4_lblk_t,    lblk            )
 -              __field(        unsigned,       len             )
 -              __field(        unsigned,       flags           )
 +              __field(        unsigned int,   len             )
 +              __field(        unsigned int,   flags           )
        ),
  
        TP_fast_assign(
        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned) __entry->lblk, __entry->len, __entry->flags)
 +                __entry->lblk, __entry->len, __entry->flags)
  );
  
  DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
@@@ -1432,7 -1457,7 +1430,7 @@@ DEFINE_EVENT(ext4__map_blocks_enter, ex
  
  DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 -               ext4_fsblk_t pblk, unsigned len, int ret),
 +               ext4_fsblk_t pblk, unsigned int len, int ret),
  
        TP_ARGS(inode, lblk, pblk, len, ret),
  
                __field(        dev_t,          dev             )
                __field(        ext4_lblk_t,    lblk            )
                __field(        ext4_fsblk_t,   pblk            )
 -              __field(        unsigned,       len             )
 +              __field(        unsigned int,   len             )
                __field(        int,            ret             )
        ),
  
        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
 +                __entry->lblk, __entry->pblk,
                  __entry->len, __entry->ret)
  );
  
@@@ -1497,7 -1522,7 +1495,7 @@@ TRACE_EVENT(ext4_ext_load_extent
        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
 -                (unsigned) __entry->lblk, (unsigned long long) __entry->pblk)
 +                __entry->lblk, __entry->pblk)
  );
  
  TRACE_EVENT(ext4_load_inode,
diff --combined mm/backing-dev.c
@@@ -45,6 -45,17 +45,17 @@@ static struct timer_list sync_supers_ti
  static int bdi_sync_supers(void *);
  static void sync_supers_timer_fn(unsigned long);
  
+ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+ {
+       if (wb1 < wb2) {
+               spin_lock(&wb1->list_lock);
+               spin_lock_nested(&wb2->list_lock, 1);
+       } else {
+               spin_lock(&wb2->list_lock);
+               spin_lock_nested(&wb1->list_lock, 1);
+       }
+ }
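The helper above avoids ABBA deadlocks by always taking the lower-addressed
lock first, whichever order the caller names them in. A minimal userspace
sketch of the same idiom (hypothetical names lock_two/unlock_two, pthread
mutexes instead of spinlocks, and without the lockdep annotation that
spin_lock_nested() provides):

#include <pthread.h>
#include <stdio.h>

/* Always acquire the lower-addressed mutex first; callers are expected to
 * pass two distinct locks, just as bdi_lock_two()'s callers do. */
static void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Both calls acquire m1 and m2 in the same global (address) order,
         * so two threads doing this concurrently can never deadlock. */
        lock_two(&m1, &m2);
        unlock_two(&m1, &m2);
        lock_two(&m2, &m1);
        unlock_two(&m2, &m1);
        printf("lock order fixed by address: no ABBA deadlock possible\n");
        return 0;
}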
  #ifdef CONFIG_DEBUG_FS
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
@@@ -67,34 -78,42 +78,42 @@@ static int bdi_debug_stats_show(struct 
        struct inode *inode;
  
        nr_dirty = nr_io = nr_more_io = 0;
-       spin_lock(&inode_wb_list_lock);
+       spin_lock(&wb->list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
-       spin_unlock(&inode_wb_list_lock);
+       spin_unlock(&wb->list_lock);
  
        global_dirty_limits(&background_thresh, &dirty_thresh);
        bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
  
  #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
-                  "BdiWriteback:     %8lu kB\n"
-                  "BdiReclaimable:   %8lu kB\n"
-                  "BdiDirtyThresh:   %8lu kB\n"
-                  "DirtyThresh:      %8lu kB\n"
-                  "BackgroundThresh: %8lu kB\n"
-                  "b_dirty:          %8lu\n"
-                  "b_io:             %8lu\n"
-                  "b_more_io:        %8lu\n"
-                  "bdi_list:         %8u\n"
-                  "state:            %8lx\n",
+                  "BdiWriteback:       %10lu kB\n"
+                  "BdiReclaimable:     %10lu kB\n"
+                  "BdiDirtyThresh:     %10lu kB\n"
+                  "DirtyThresh:        %10lu kB\n"
+                  "BackgroundThresh:   %10lu kB\n"
+                  "BdiWritten:         %10lu kB\n"
+                  "BdiWriteBandwidth:  %10lu kBps\n"
+                  "b_dirty:            %10lu\n"
+                  "b_io:               %10lu\n"
+                  "b_more_io:          %10lu\n"
+                  "bdi_list:           %10u\n"
+                  "state:              %10lx\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-                  K(bdi_thresh), K(dirty_thresh),
-                  K(background_thresh), nr_dirty, nr_io, nr_more_io,
+                  K(bdi_thresh),
+                  K(dirty_thresh),
+                  K(background_thresh),
+                  (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+                  (unsigned long) K(bdi->write_bandwidth),
+                  nr_dirty,
+                  nr_io,
+                  nr_more_io,
                   !list_empty(&bdi->bdi_list), bdi->state);
  #undef K
  
@@@ -249,18 -268,6 +268,6 @@@ int bdi_has_dirty_io(struct backing_dev
        return wb_has_dirty_io(&bdi->wb);
  }
  
- static void bdi_flush_io(struct backing_dev_info *bdi)
- {
-       struct writeback_control wbc = {
-               .sync_mode              = WB_SYNC_NONE,
-               .older_than_this        = NULL,
-               .range_cyclic           = 1,
-               .nr_to_write            = 1024,
-       };
-       writeback_inodes_wb(&bdi->wb, &wbc);
- }
  /*
   * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
   * or we risk deadlocking on ->s_umount. The longer term solution would be
@@@ -446,9 -453,10 +453,10 @@@ static int bdi_forker_thread(void *ptr
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
-                                * the bdi from the thread.
+                                * the bdi from the thread. Hopefully 1024 is
+                                * large enough for efficient IO.
                                 */
-                               bdi_flush_io(bdi);
+                               writeback_inodes_wb(&bdi->wb, 1024);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
@@@ -505,7 -513,7 +513,7 @@@ static void bdi_remove_from_list(struc
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);
  
 -      synchronize_rcu();
 +      synchronize_rcu_expedited();
  }
  
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@@ -606,7 -614,6 +614,7 @@@ static void bdi_prune_sb(struct backing
  void bdi_unregister(struct backing_dev_info *bdi)
  {
        if (bdi->dev) {
 +              bdi_set_min_ratio(bdi, 0);
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
                del_timer_sync(&bdi->wb.wakeup_timer);
@@@ -629,9 -636,15 +637,15 @@@ static void bdi_wb_init(struct bdi_writ
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
+       spin_lock_init(&wb->list_lock);
        setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
  }
  
+ /*
+  * Initial write bandwidth: 100 MB/s
+  */
+ #define INIT_BW               (100 << (20 - PAGE_SHIFT))
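For reference, write_bandwidth is kept in pages per second, so the expression
above converts 100 MB/s (100 * 2^20 bytes/s) into pages by shifting away
PAGE_SHIFT: with the common 4 KiB pages (PAGE_SHIFT == 12), INIT_BW works out
to 100 << 8 = 25600 pages/s, the estimate every bdi starts from until
__bdi_update_bandwidth() has seen real writeback.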
  int bdi_init(struct backing_dev_info *bdi)
  {
        int i, err;
        }
  
        bdi->dirty_exceeded = 0;
+       bdi->bw_time_stamp = jiffies;
+       bdi->written_stamp = 0;
+       bdi->write_bandwidth = INIT_BW;
+       bdi->avg_write_bandwidth = INIT_BW;
        err = prop_local_init_percpu(&bdi->completions);
  
        if (err) {
@@@ -677,11 -697,12 +698,12 @@@ void bdi_destroy(struct backing_dev_inf
        if (bdi_has_dirty_io(bdi)) {
                struct bdi_writeback *dst = &default_backing_dev_info.wb;
  
-               spin_lock(&inode_wb_list_lock);
+               bdi_lock_two(&bdi->wb, dst);
                list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                list_splice(&bdi->wb.b_io, &dst->b_io);
                list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-               spin_unlock(&inode_wb_list_lock);
+               spin_unlock(&bdi->wb.list_lock);
+               spin_unlock(&dst->list_lock);
        }
  
        bdi_unregister(bdi);
diff --combined mm/filemap.c
   *  ->i_mutex                 (generic_file_buffered_write)
   *    ->mmap_sem              (fault_in_pages_readable->do_page_fault)
   *
-  *  inode_wb_list_lock
 - *  ->i_mutex
 - *    ->i_alloc_sem             (various)
 - *
+  *  bdi->wb.list_lock
   *    sb_lock                 (fs/fs-writeback.c)
   *    ->mapping->tree_lock    (__sync_single_inode)
   *
@@@ -96,9 -99,9 +96,9 @@@
   *    ->zone.lru_lock         (check_pte_range->isolate_lru_page)
   *    ->private_lock          (page_remove_rmap->set_page_dirty)
   *    ->tree_lock             (page_remove_rmap->set_page_dirty)
-  *    inode_wb_list_lock      (page_remove_rmap->set_page_dirty)
+  *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
-  *    inode_wb_list_lock      (zap_pte_range->set_page_dirty)
+  *    bdi.wb->list_lock               (zap_pte_range->set_page_dirty)
   *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
   *
@@@ -128,7 -131,6 +128,7 @@@ void __delete_from_page_cache(struct pa
  
        radix_tree_delete(&mapping->page_tree, page->index);
        page->mapping = NULL;
 +      /* Leave page->index set: truncation lookup relies upon it */
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
@@@ -484,7 -486,6 +484,7 @@@ int add_to_page_cache_locked(struct pag
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
 +                      /* Leave page->index set: truncation relies upon it */
                        spin_unlock_irq(&mapping->tree_lock);
                        mem_cgroup_uncharge_cache_page(page);
                        page_cache_release(page);
@@@ -1794,7 -1795,7 +1794,7 @@@ EXPORT_SYMBOL(generic_file_readonly_mma
  
  static struct page *__read_cache_page(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
  {
@@@ -1825,7 -1826,7 +1825,7 @@@ repeat
  
  static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
  
@@@ -1865,7 -1866,7 +1865,7 @@@ out
   * @mapping:  the page's address_space
   * @index:    the page index
   * @filler:   function to perform the read
 - * @data:     destination for read data
 + * @data:     first arg to filler(data, page) function, often left as NULL
   *
   * Same as read_cache_page, but don't wait for page to become unlocked
   * after submitting it to the filler.
   */
  struct page *read_cache_page_async(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data)
  {
        return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@@ -1925,7 -1926,7 +1925,7 @@@ EXPORT_SYMBOL(read_cache_page_gfp)
   * @mapping:  the page's address_space
   * @index:    the page index
   * @filler:   function to perform the read
 - * @data:     destination for read data
 + * @data:     first arg to filler(data, page) function, often left as NULL
   *
   * Read into the page cache. If a page already exists, and PageUptodate() is
   * not set, try to fill the page then wait for it to become unlocked.
   */
  struct page *read_cache_page(struct address_space *mapping,
                                pgoff_t index,
 -                              int (*filler)(void *,struct page*),
 +                              int (*filler)(void *, struct page *),
                                void *data)
  {
        return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@@ -1999,7 -2000,7 +1999,7 @@@ int file_remove_suid(struct file *file
                error = security_inode_killpriv(dentry);
        if (!error && killsuid)
                error = __remove_suid(dentry, killsuid);
 -      if (!error)
 +      if (!error && (inode->i_sb->s_flags & MS_NOSEC))
                inode->i_flags |= S_NOSEC;
  
        return error;
diff --combined mm/page-writeback.c
  #include <linux/pagevec.h>
  #include <trace/events/writeback.h>
  
+ /*
+  * Sleep at most 200ms at a time in balance_dirty_pages().
+  */
+ #define MAX_PAUSE             max(HZ/5, 1)
+ /*
+  * Estimate write bandwidth at 200ms intervals.
+  */
+ #define BANDWIDTH_INTERVAL    max(HZ/5, 1)
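Both constants are in jiffies: HZ/5 is 200ms for the usual HZ choices
(20 jiffies at HZ=100, 50 at HZ=250, 200 at HZ=1000), and the max(..., 1)
only guards against the division rounding down to zero on exotically low
HZ configurations.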
  /*
   * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
   * will look to see if it needs to force writeback or throttling.
@@@ -111,6 -121,7 +121,7 @@@ EXPORT_SYMBOL(laptop_mode)
  
  /* End of sysctl-exported parameters */
  
+ unsigned long global_dirty_limit;
  
  /*
   * Scale the writeback cache size proportional to the relative writeout speeds.
@@@ -219,6 -230,7 +230,7 @@@ int dirty_bytes_handler(struct ctl_tabl
   */
  static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
  {
+       __inc_bdi_stat(bdi, BDI_WRITTEN);
        __prop_inc_percpu_max(&vm_completions, &bdi->completions,
                              bdi->max_prop_frac);
  }
@@@ -244,13 -256,8 +256,8 @@@ void task_dirty_inc(struct task_struct 
  static void bdi_writeout_fraction(struct backing_dev_info *bdi,
                long *numerator, long *denominator)
  {
-       if (bdi_cap_writeback_dirty(bdi)) {
-               prop_fraction_percpu(&vm_completions, &bdi->completions,
+       prop_fraction_percpu(&vm_completions, &bdi->completions,
                                numerator, denominator);
-       } else {
-               *numerator = 0;
-               *denominator = 1;
-       }
  }
  
  static inline void task_dirties_fraction(struct task_struct *tsk,
   * effectively curb the growth of dirty pages. Light dirtiers with high enough
   * dirty threshold may never get throttled.
   */
+ #define TASK_LIMIT_FRACTION 8
  static unsigned long task_dirty_limit(struct task_struct *tsk,
                                       unsigned long bdi_dirty)
  {
        long numerator, denominator;
        unsigned long dirty = bdi_dirty;
-       u64 inv = dirty >> 3;
+       u64 inv = dirty / TASK_LIMIT_FRACTION;
  
        task_dirties_fraction(tsk, &numerator, &denominator);
        inv *= numerator;
        return max(dirty, bdi_dirty/2);
  }
  
+ /* Minimum limit for any task */
+ static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+ {
+       return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+ }
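A quick worked example of the two limits above: with bdi_dirty = 100000
pages, a task that has done essentially all of the recent dirtying gets the
full 1/8 knocked off by task_dirty_limit(), i.e. 87500 pages, which is
exactly what task_min_dirty_limit() returns, while a task that dirtied
nothing keeps the whole 100000. balance_dirty_pages() below throttles
against the per-task value but only clears dirty_exceeded once the bdi is
under the lower one, so the flag cannot go away while some task may still be
over its own limit.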
  /*
   *
   */
@@@ -397,6 -411,11 +411,11 @@@ unsigned long determine_dirtyable_memor
        return x + 1;   /* Ensure that we never return 0 */
  }
  
+ static unsigned long hard_dirty_limit(unsigned long thresh)
+ {
+       return max(thresh, global_dirty_limit);
+ }
  /*
   * global_dirty_limits - background-writeback and dirty-throttling thresholds
   *
@@@ -435,12 -454,20 +454,20 @@@ void global_dirty_limits(unsigned long 
        }
        *pbackground = background;
        *pdirty = dirty;
+       trace_global_dirty_state(background, dirty);
  }
  
- /*
+ /**
   * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+  * @bdi: the backing_dev_info to query
+  * @dirty: global dirty limit in pages
   *
-  * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+  * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+  * The "limit" in the name is not treated as a hard limit by
+  * balance_dirty_pages().
+  *
+  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
   * - starving fast devices
   * - piling up dirty pages (that will take long time to sync) on slow devices
   *
@@@ -468,6 -495,153 +495,153 @@@ unsigned long bdi_dirty_limit(struct ba
        return bdi_dirty;
  }
  
+ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+                                      unsigned long elapsed,
+                                      unsigned long written)
+ {
+       const unsigned long period = roundup_pow_of_two(3 * HZ);
+       unsigned long avg = bdi->avg_write_bandwidth;
+       unsigned long old = bdi->write_bandwidth;
+       u64 bw;
+       /*
+        * bw = written * HZ / elapsed
+        *
+        *                   bw * elapsed + write_bandwidth * (period - elapsed)
+        * write_bandwidth = ---------------------------------------------------
+        *                                          period
+        */
+       bw = written - bdi->written_stamp;
+       bw *= HZ;
+       if (unlikely(elapsed > period)) {
+               do_div(bw, elapsed);
+               avg = bw;
+               goto out;
+       }
+       bw += (u64)bdi->write_bandwidth * (period - elapsed);
+       bw >>= ilog2(period);
+       /*
+        * one more level of smoothing, for filtering out sudden spikes
+        */
+       if (avg > old && old >= (unsigned long)bw)
+               avg -= (avg - old) >> 3;
+       if (avg < old && old <= (unsigned long)bw)
+               avg += (old - avg) >> 3;
+ out:
+       bdi->write_bandwidth = bw;
+       bdi->avg_write_bandwidth = avg;
+ }
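A rough, self-contained illustration of the arithmetic above, with made-up
numbers rather than a real trace: at HZ=1000 the period rounds up to 4096
jiffies, and a single update after 200 jiffies during which 5000 pages were
written blends an instantaneous 25000 pages/s with a previous estimate of
25600 pages/s, landing near the old value because elapsed is small relative
to period.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Illustrative values only. */
        const uint64_t HZ = 1000;
        const uint64_t period = 4096;          /* roundup_pow_of_two(3 * HZ) */
        const uint64_t old_bw = 25600;         /* previous estimate, pages/s */
        const uint64_t elapsed = 200;          /* jiffies since last update */
        const uint64_t written = 5000;         /* pages completed meanwhile */

        /* the bw * elapsed term of the formula is simply written * HZ */
        uint64_t bw = written * HZ;
        bw += old_bw * (period - elapsed);     /* weight in the old estimate */
        bw >>= 12;                             /* ilog2(period) == 12 */

        printf("instantaneous %llu pages/s, smoothed %llu pages/s (old %llu)\n",
               (unsigned long long)(written * HZ / elapsed),
               (unsigned long long)bw,
               (unsigned long long)old_bw);
        return 0;
}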
+ /*
+  * The global dirtyable memory and dirty threshold could be suddenly knocked
+  * down by a large amount (eg. on the startup of KVM in a swapless system).
+  * This may throw the system into deep dirty exceeded state and throttle
+  * heavy/light dirtiers alike. To retain good responsiveness, maintain
+  * global_dirty_limit, which is only allowed to track the lowered dirty
+  * threshold gradually.
+  */
+ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+ {
+       unsigned long limit = global_dirty_limit;
+       /*
+        * Follow up in one step.
+        */
+       if (limit < thresh) {
+               limit = thresh;
+               goto update;
+       }
+       /*
+        * Follow down slowly. Use the higher one as the target, because thresh
+        * may drop below dirty. This is exactly why global_dirty_limit is
+        * introduced: it is guaranteed to lie above the number of dirty pages.
+        */
+       thresh = max(thresh, dirty);
+       if (limit > thresh) {
+               limit -= (limit - thresh) >> 5;
+               goto update;
+       }
+       return;
+ update:
+       global_dirty_limit = limit;
+ }
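Because global_update_bandwidth() below runs this at most once per
BANDWIDTH_INTERVAL (200ms) and each run closes 1/32 of the remaining gap, the
tracked limit approaches a lowered threshold roughly as (31/32)^n: about 22
updates, on the order of 4-5 seconds, to halve the distance. That is the
"gradually" the comment above refers to.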
+ static void global_update_bandwidth(unsigned long thresh,
+                                   unsigned long dirty,
+                                   unsigned long now)
+ {
+       static DEFINE_SPINLOCK(dirty_lock);
+       static unsigned long update_time;
+       /*
+        * check locklessly first to avoid taking the lock most of the time
+        */
+       if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+               return;
+       spin_lock(&dirty_lock);
+       if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+               update_dirty_limit(thresh, dirty);
+               update_time = now;
+       }
+       spin_unlock(&dirty_lock);
+ }
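The unlocked time_before() check is purely an optimization so the common case
returns without touching dirty_lock; the decisive comparison is repeated under
the lock, so even if several CPUs race past the first test only one of them
performs the update for any given BANDWIDTH_INTERVAL.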
+ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+                           unsigned long thresh,
+                           unsigned long dirty,
+                           unsigned long bdi_thresh,
+                           unsigned long bdi_dirty,
+                           unsigned long start_time)
+ {
+       unsigned long now = jiffies;
+       unsigned long elapsed = now - bdi->bw_time_stamp;
+       unsigned long written;
+       /*
+        * rate-limit, only update once every 200ms.
+        */
+       if (elapsed < BANDWIDTH_INTERVAL)
+               return;
+       written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+       /*
+        * Skip quiet periods when disk bandwidth is under-utilized.
+        * (at least 1s idle time between two flusher runs)
+        */
+       if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+               goto snapshot;
+       if (thresh)
+               global_update_bandwidth(thresh, dirty, now);
+       bdi_update_write_bandwidth(bdi, elapsed, written);
+ snapshot:
+       bdi->written_stamp = written;
+       bdi->bw_time_stamp = now;
+ }
+ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+                                unsigned long thresh,
+                                unsigned long dirty,
+                                unsigned long bdi_thresh,
+                                unsigned long bdi_dirty,
+                                unsigned long start_time)
+ {
+       if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+               return;
+       spin_lock(&bdi->wb.list_lock);
+       __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+                              start_time);
+       spin_unlock(&bdi->wb.list_lock);
+ }
  /*
   * balance_dirty_pages() must be called by processes which are generating dirty
   * data.  It looks at the number of dirty pages in the machine and will force
  static void balance_dirty_pages(struct address_space *mapping,
                                unsigned long write_chunk)
  {
-       long nr_reclaimable, bdi_nr_reclaimable;
-       long nr_writeback, bdi_nr_writeback;
+       unsigned long nr_reclaimable, bdi_nr_reclaimable;
+       unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
+       unsigned long bdi_dirty;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;
+       unsigned long task_bdi_thresh;
+       unsigned long min_task_bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long pause = 1;
        bool dirty_exceeded = false;
+       bool clear_dirty_exceeded = true;
        struct backing_dev_info *bdi = mapping->backing_dev_info;
+       unsigned long start_time = jiffies;
  
        for (;;) {
-               struct writeback_control wbc = {
-                       .sync_mode      = WB_SYNC_NONE,
-                       .older_than_this = NULL,
-                       .nr_to_write    = write_chunk,
-                       .range_cyclic   = 1,
-               };
                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                        global_page_state(NR_UNSTABLE_NFS);
-               nr_writeback = global_page_state(NR_WRITEBACK);
+               nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
  
                global_dirty_limits(&background_thresh, &dirty_thresh);
  
                 * catch-up. This avoids (excessively) small writeouts
                 * when the bdi limits are ramping up.
                 */
-               if (nr_reclaimable + nr_writeback <=
-                               (background_thresh + dirty_thresh) / 2)
+               if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
                        break;
  
                bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-               bdi_thresh = task_dirty_limit(current, bdi_thresh);
+               min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+               task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
  
                /*
                 * In order to avoid the stacked BDI deadlock we need
                 * actually dirty; with m+n sitting in the percpu
                 * deltas.
                 */
-               if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+               if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
                        bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-                       bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+                       bdi_dirty = bdi_nr_reclaimable +
+                                   bdi_stat_sum(bdi, BDI_WRITEBACK);
                } else {
                        bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-                       bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+                       bdi_dirty = bdi_nr_reclaimable +
+                                   bdi_stat(bdi, BDI_WRITEBACK);
                }
  
                /*
                 * bdi or process from holding back light ones; The latter is
                 * the last resort safeguard.
                 */
-               dirty_exceeded =
-                       (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
-                       || (nr_reclaimable + nr_writeback > dirty_thresh);
+               dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+                                 (nr_dirty > dirty_thresh);
+               clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+                                       (nr_dirty <= dirty_thresh);
  
                if (!dirty_exceeded)
                        break;
                if (!bdi->dirty_exceeded)
                        bdi->dirty_exceeded = 1;
  
+               bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+                                    bdi_thresh, bdi_dirty, start_time);
                /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
                 * Unstable writes are a feature of certain networked
                 * filesystems (i.e. NFS) in which data may have been
                 * threshold otherwise wait until the disk writes catch
                 * up.
                 */
-               trace_wbc_balance_dirty_start(&wbc, bdi);
-               if (bdi_nr_reclaimable > bdi_thresh) {
-                       writeback_inodes_wb(&bdi->wb, &wbc);
-                       pages_written += write_chunk - wbc.nr_to_write;
-                       trace_wbc_balance_dirty_written(&wbc, bdi);
+               trace_balance_dirty_start(bdi);
+               if (bdi_nr_reclaimable > task_bdi_thresh) {
+                       pages_written += writeback_inodes_wb(&bdi->wb,
+                                                            write_chunk);
+                       trace_balance_dirty_written(bdi, pages_written);
                        if (pages_written >= write_chunk)
                                break;          /* We've done our duty */
                }
-               trace_wbc_balance_dirty_wait(&wbc, bdi);
                __set_current_state(TASK_UNINTERRUPTIBLE);
                io_schedule_timeout(pause);
+               trace_balance_dirty_wait(bdi);
+               dirty_thresh = hard_dirty_limit(dirty_thresh);
+               /*
+                * max-pause area. If dirty exceeded but still within this
+                * area, no need to sleep for more than 200ms: (a) 8 pages per
+                * 200ms is typically more than enough to curb heavy dirtiers;
+                * (b) the pause time limit makes the dirtiers more responsive.
+                */
+               if (nr_dirty < dirty_thresh +
+                              dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+                   time_after(jiffies, start_time + MAX_PAUSE))
+                       break;
+               /*
+                * pass-good area. When some bdi gets blocked (eg. NFS server
+                * not responding), or write bandwidth dropped dramatically due
+                * to concurrent reads, or dirty threshold suddenly dropped and
+                * the dirty pages cannot be brought down anytime soon (eg. on
+                * slow USB stick), at least let go of the good bdi's.
+                */
+               if (nr_dirty < dirty_thresh +
+                              dirty_thresh / DIRTY_PASSGOOD_AREA &&
+                   bdi_dirty < bdi_thresh)
+                       break;
  
                /*
                 * Increase the delay for each loop, up to our previous
                        pause = HZ / 10;
        }
  
-       if (!dirty_exceeded && bdi->dirty_exceeded)
+       /* Clear dirty_exceeded flag only when no task can exceed the limit */
+       if (clear_dirty_exceeded && bdi->dirty_exceeded)
                bdi->dirty_exceeded = 0;
  
        if (writeback_in_progress(bdi))
@@@ -626,9 -828,13 +828,13 @@@ static DEFINE_PER_CPU(unsigned long, bd
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                        unsigned long nr_pages_dirtied)
  {
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
        unsigned long ratelimit;
        unsigned long *p;
  
+       if (!bdi_cap_account_dirty(bdi))
+               return;
        ratelimit = ratelimit_pages;
        if (mapping->backing_dev_info->dirty_exceeded)
                ratelimit = 8;
@@@ -892,12 -1098,12 +1098,12 @@@ int write_cache_pages(struct address_sp
                        range_whole = 1;
                cycled = 1; /* ignore range_cyclic tests */
        }
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
  retry:
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
        done_index = index;
        while (!done && (index <= end)) {
@@@ -1141,6 -1347,7 +1347,6 @@@ EXPORT_SYMBOL(account_page_dirtied)
  void account_page_writeback(struct page *page)
  {
        inc_zone_page_state(page, NR_WRITEBACK);
 -      inc_zone_page_state(page, NR_WRITTEN);
  }
  EXPORT_SYMBOL(account_page_writeback);
  
@@@ -1357,10 -1564,8 +1563,10 @@@ int test_clear_page_writeback(struct pa
        } else {
                ret = TestClearPageWriteback(page);
        }
 -      if (ret)
 +      if (ret) {
                dec_zone_page_state(page, NR_WRITEBACK);
 +              inc_zone_page_state(page, NR_WRITTEN);
 +      }
        return ret;
  }
  
@@@ -1406,6 -1611,10 +1612,6 @@@ EXPORT_SYMBOL(test_set_page_writeback)
   */
  int mapping_tagged(struct address_space *mapping, int tag)
  {
 -      int ret;
 -      rcu_read_lock();
 -      ret = radix_tree_tagged(&mapping->page_tree, tag);
 -      rcu_read_unlock();
 -      return ret;
 +      return radix_tree_tagged(&mapping->page_tree, tag);
  }
  EXPORT_SYMBOL(mapping_tagged);
diff --combined mm/rmap.c
+++ b/mm/rmap.c
@@@ -21,6 -21,7 +21,6 @@@
   * Lock ordering in mm:
   *
   * inode->i_mutex     (while writing or truncating, not reading or faulting)
 - *   inode->i_alloc_sem (vmtruncate_range)
   *   mm->mmap_sem
   *     page->flags PG_locked (lock_page)
   *       mapping->i_mmap_mutex
   *               mmlist_lock (in mmput, drain_mmlist and others)
   *               mapping->private_lock (in __set_page_dirty_buffers)
   *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
-  *               inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
   *                 sb_lock (within inode_lock in fs/fs-writeback.c)
   *                 mapping->tree_lock (widely used, in set_page_dirty,
   *                           in arch-dependent flush_dcache_mmap_lock,
-  *                           within inode_wb_list_lock in __sync_single_inode)
+  *                           within bdi.wb->list_lock in __sync_single_inode)
   *
 - * (code doesn't rely on that order so it could be switched around)
 - * ->tasklist_lock
 - *   anon_vma->mutex      (memory_failure, collect_procs_anon)
 + * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
 + *   ->tasklist_lock
   *     pte map lock
   */
  
@@@ -110,9 -112,9 +110,9 @@@ static inline void anon_vma_free(struc
        kmem_cache_free(anon_vma_cachep, anon_vma);
  }
  
 -static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
 +static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
  {
 -      return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
 +      return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
  }
  
  static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@@ -157,7 -159,7 +157,7 @@@ int anon_vma_prepare(struct vm_area_str
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated;
  
 -              avc = anon_vma_chain_alloc();
 +              avc = anon_vma_chain_alloc(GFP_KERNEL);
                if (!avc)
                        goto out_enomem;
  
        return -ENOMEM;
  }
  
 +/*
 + * This is a useful helper function for locking the anon_vma root as
 + * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 + * have the same vma.
 + *
 + * Such anon_vma's should have the same root, so you'd expect to see
 + * just a single mutex_lock for the whole traversal.
 + */
 +static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
 +{
 +      struct anon_vma *new_root = anon_vma->root;
 +      if (new_root != root) {
 +              if (WARN_ON_ONCE(root))
 +                      mutex_unlock(&root->mutex);
 +              root = new_root;
 +              mutex_lock(&root->mutex);
 +      }
 +      return root;
 +}
 +
 +static inline void unlock_anon_vma_root(struct anon_vma *root)
 +{
 +      if (root)
 +              mutex_unlock(&root->mutex);
 +}
 +
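The point of these helpers is lock batching: anon_vma_clone() and
unlink_anon_vmas() below walk a vma's anon_vma_chain, whose entries normally
share one root anon_vma, so lock_anon_vma_root() takes the root mutex once and
only drops and re-takes it on the rare switch to a different root, rather than
locking and unlocking around every list entry the way the removed
anon_vma_unlink() did.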
  static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
        avc->anon_vma = anon_vma;
        list_add(&avc->same_vma, &vma->anon_vma_chain);
  
 -      anon_vma_lock(anon_vma);
        /*
         * It's critical to add new vmas to the tail of the anon_vma,
         * see comment in huge_memory.c:__split_huge_page().
         */
        list_add_tail(&avc->same_anon_vma, &anon_vma->head);
 -      anon_vma_unlock(anon_vma);
  }
  
  /*
  int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
  {
        struct anon_vma_chain *avc, *pavc;
 +      struct anon_vma *root = NULL;
  
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
 -              avc = anon_vma_chain_alloc();
 -              if (!avc)
 -                      goto enomem_failure;
 -              anon_vma_chain_link(dst, avc, pavc->anon_vma);
 +              struct anon_vma *anon_vma;
 +
 +              avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
 +              if (unlikely(!avc)) {
 +                      unlock_anon_vma_root(root);
 +                      root = NULL;
 +                      avc = anon_vma_chain_alloc(GFP_KERNEL);
 +                      if (!avc)
 +                              goto enomem_failure;
 +              }
 +              anon_vma = pavc->anon_vma;
 +              root = lock_anon_vma_root(root, anon_vma);
 +              anon_vma_chain_link(dst, avc, anon_vma);
        }
 +      unlock_anon_vma_root(root);
        return 0;
  
   enomem_failure:
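
The allocation fallback in anon_vma_clone() above deserves a note: the GFP_NOWAIT | __GFP_NOWARN attempt is made first so the batched root lock can stay held across iterations, and only when that fails is the lock dropped and a sleeping GFP_KERNEL allocation tried. Pulled out on its own it looks like this (a hypothetical helper, not part of the patch; it reuses only functions defined in this file):

/* Hypothetical sketch of the allocation fallback used by anon_vma_clone().
 * On the slow path the batched root lock is dropped and *rootp is reset to
 * NULL, so the caller re-locks via lock_anon_vma_root() on the next pass.
 */
static struct anon_vma_chain *avc_alloc_fallback_sketch(struct anon_vma **rootp)
{
	struct anon_vma_chain *avc;

	/* Cheap, non-sleeping attempt while root->mutex may still be held. */
	avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
	if (likely(avc))
		return avc;

	/* Drop the lock before a GFP_KERNEL allocation, which may sleep. */
	unlock_anon_vma_root(*rootp);
	*rootp = NULL;
	return anon_vma_chain_alloc(GFP_KERNEL);	/* may still return NULL */
}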
@@@ -296,7 -263,7 +296,7 @@@ int anon_vma_fork(struct vm_area_struc
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
 -      avc = anon_vma_chain_alloc();
 +      avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;
  
        get_anon_vma(anon_vma->root);
        /* Mark this anon_vma as the one where our new (COWed) pages go. */
        vma->anon_vma = anon_vma;
 +      anon_vma_lock(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
 +      anon_vma_unlock(anon_vma);
  
        return 0;
  
        return -ENOMEM;
  }
  
 -static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 -{
 -      struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
 -      int empty;
 -
 -      /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
 -      if (!anon_vma)
 -              return;
 -
 -      anon_vma_lock(anon_vma);
 -      list_del(&anon_vma_chain->same_anon_vma);
 -
 -      /* We must garbage collect the anon_vma if it's empty */
 -      empty = list_empty(&anon_vma->head);
 -      anon_vma_unlock(anon_vma);
 -
 -      if (empty)
 -              put_anon_vma(anon_vma);
 -}
 -
  void unlink_anon_vmas(struct vm_area_struct *vma)
  {
        struct anon_vma_chain *avc, *next;
 +      struct anon_vma *root = NULL;
  
        /*
         * Unlink each anon_vma chained to the VMA.  This list is ordered
         * from newest to oldest, ensuring the root anon_vma gets freed last.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 -              anon_vma_unlink(avc);
 +              struct anon_vma *anon_vma = avc->anon_vma;
 +
 +              root = lock_anon_vma_root(root, anon_vma);
 +              list_del(&avc->same_anon_vma);
 +
 +              /*
 +               * Leave empty anon_vmas on the list - we'll need
 +               * to free them outside the lock.
 +               */
 +              if (list_empty(&anon_vma->head))
 +                      continue;
 +
 +              list_del(&avc->same_vma);
 +              anon_vma_chain_free(avc);
 +      }
 +      unlock_anon_vma_root(root);
 +
 +      /*
 +       * Iterate the list once more; it now contains only empty and unlinked
 +       * anon_vmas, so destroy them here. This could not be done in the first
 +       * pass because __put_anon_vma() needs to acquire the anon_vma->root->mutex.
 +       */
 +      list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 +              struct anon_vma *anon_vma = avc->anon_vma;
 +
 +              put_anon_vma(anon_vma);
 +
                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
@@@ -869,11 -827,11 +869,11 @@@ int page_referenced(struct page *page
                                                                vm_flags);
                if (we_locked)
                        unlock_page(page);
 +
 +              if (page_test_and_clear_young(page_to_pfn(page)))
 +                      referenced++;
        }
  out:
 -      if (page_test_and_clear_young(page_to_pfn(page)))
 -              referenced++;
 -
        return referenced;
  }