From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 26 Jul 2011 17:39:54 +0000 (-0700)
Subject: Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback
X-Git-Tag: v3.1-rc1~228
X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=commitdiff_plain;h=f01ef569cddb1a8627b1c6b3a134998ad1cf4b22;hp=-c

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback: (27 commits)
  mm: properly reflect task dirty limits in dirty_exceeded logic
  writeback: don't busy retry writeback on new/freeing inodes
  writeback: scale IO chunk size up to half device bandwidth
  writeback: trace global_dirty_state
  writeback: introduce max-pause and pass-good dirty limits
  writeback: introduce smoothed global dirty limit
  writeback: consolidate variable names in balance_dirty_pages()
  writeback: show bdi write bandwidth in debugfs
  writeback: bdi write bandwidth estimation
  writeback: account per-bdi accumulated written pages
  writeback: make writeback_control.nr_to_write straight
  writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()
  writeback: trace event writeback_queue_io
  writeback: trace event writeback_single_inode
  writeback: remove .nonblocking and .encountered_congestion
  writeback: remove writeback_control.more_io
  writeback: skip balance_dirty_pages() for in-memory fs
  writeback: add bdi_dirty_limit() kernel-doc
  writeback: avoid extra sync work at enqueue time
  writeback: elevate queue_io() into wb_writeback()
  ...

Fix up trivial conflicts in fs/fs-writeback.c and mm/filemap.c
---

f01ef569cddb1a8627b1c6b3a134998ad1cf4b22
diff --combined fs/block_dev.c
index c62fb84944d5,3c9a03e51b62..f55aad4d1611
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@@ -44,24 -44,28 +44,28 @@@ inline struct block_device *I_BDEV(stru
  {
  	return &BDEV_I(inode)->bdev;
  }
- 
  EXPORT_SYMBOL(I_BDEV);
  
  /*
-  * move the inode from it's current bdi to the a new bdi. if the inode is dirty
-  * we need to move it onto the dirty list of @dst so that the inode is always
-  * on the right list.
+  * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+  * need to move it onto the dirty list of @dst so that the inode is always on
+  * the right list.
   */
  static void bdev_inode_switch_bdi(struct inode *inode,
  			struct backing_dev_info *dst)
  {
- 	spin_lock(&inode_wb_list_lock);
+ 	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+ 
+ 	if (unlikely(dst == old))		/* deadlock avoidance */
+ 		return;
+ 	bdi_lock_two(&old->wb, &dst->wb);
  	spin_lock(&inode->i_lock);
  	inode->i_data.backing_dev_info = dst;
  	if (inode->i_state & I_DIRTY)
  		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
  	spin_unlock(&inode->i_lock);
- 	spin_unlock(&inode_wb_list_lock);
+ 	spin_unlock(&old->wb.list_lock);
+ 	spin_unlock(&dst->wb.list_lock);
  }
  
  static sector_t max_block(struct block_device *bdev)
@@@ -355,30 -359,25 +359,30 @@@ static loff_t block_llseek(struct file 
  	mutex_lock(&bd_inode->i_mutex);
  	size = i_size_read(bd_inode);
  
 +	retval = -EINVAL;
  	switch (origin) {
 -		case 2:
 +		case SEEK_END:
  			offset += size;
  			break;
 -		case 1:
 +		case SEEK_CUR:
  			offset += file->f_pos;
 +		case SEEK_SET:
 +			break;
 +		default:
 +			goto out;
  	}
 -	retval = -EINVAL;
  	if (offset >= 0 && offset <= size) {
  		if (offset != file->f_pos) {
  			file->f_pos = offset;
  		}
  		retval = offset;
  	}
 +out:
  	mutex_unlock(&bd_inode->i_mutex);
  	return retval;
  }
  	
 -int blkdev_fsync(struct file *filp, int datasync)
 +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
  {
  	struct inode *bd_inode = filp->f_mapping->host;
  	struct block_device *bdev = I_BDEV(bd_inode);
@@@ -389,10 -388,14 +393,10 @@@
  	 * i_mutex and doing so causes performance issues with concurrent
  	 * O_SYNC writers to a block device.
  	 */
 -	mutex_unlock(&bd_inode->i_mutex);
 -
  	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
  	if (error == -EOPNOTSUPP)
  		error = 0;
  
 -	mutex_lock(&bd_inode->i_mutex);
 -
  	return error;
  }
  EXPORT_SYMBOL(blkdev_fsync);
@@@ -763,19 -766,7 +767,19 @@@ static struct block_device *bd_start_cl
  	if (!disk)
  		return ERR_PTR(-ENXIO);
  
 -	whole = bdget_disk(disk, 0);
 +	/*
 +	 * Normally, @bdev should equal what's returned from bdget_disk()
 +	 * if partno is 0; however, some drivers (floppy) use multiple
 +	 * bdev's for the same physical device and @bdev may be one of the
 +	 * aliases.  Keep @bdev if partno is 0.  This means claimer
 +	 * tracking is broken for those devices but it has always been that
 +	 * way.
 +	 */
 +	if (partno)
 +		whole = bdget_disk(disk, 0);
 +	else
 +		whole = bdgrab(bdev);
 +
  	module_put(disk->fops->owner);
  	put_disk(disk);
  	if (!whole)
@@@ -1448,8 -1439,6 +1452,8 @@@ static int __blkdev_put(struct block_de
  
  int blkdev_put(struct block_device *bdev, fmode_t mode)
  {
 +	mutex_lock(&bdev->bd_mutex);
 +
  	if (mode & FMODE_EXCL) {
  		bool bdev_free;
  
@@@ -1458,6 -1447,7 +1462,6 @@@
  		 * are protected with bdev_lock.  bd_mutex is to
  		 * synchronize disk_holder unlinking.
  		 */
 -		mutex_lock(&bdev->bd_mutex);
  		spin_lock(&bdev_lock);
  
  		WARN_ON_ONCE(--bdev->bd_holders < 0);
@@@ -1475,21 -1465,17 +1479,21 @@@
  		 * If this was the last claim, remove holder link and
  		 * unblock evpoll if it was a write holder.
  		 */
 -		if (bdev_free) {
 -			if (bdev->bd_write_holder) {
 -				disk_unblock_events(bdev->bd_disk);
 -				disk_check_events(bdev->bd_disk);
 -				bdev->bd_write_holder = false;
 -			}
 +		if (bdev_free && bdev->bd_write_holder) {
 +			disk_unblock_events(bdev->bd_disk);
 +			bdev->bd_write_holder = false;
  		}
 -
 -		mutex_unlock(&bdev->bd_mutex);
  	}
  
 +	/*
 +	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
 +	 * event.  This is to ensure detection of media removal commanded
 +	 * from userland - e.g. eject(1).
 +	 */
 +	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
 +
 +	mutex_unlock(&bdev->bd_mutex);
 +
  	return __blkdev_put(bdev, mode, 0);
  }
  EXPORT_SYMBOL(blkdev_put);
diff --combined fs/ext4/inode.c
index 678cde834f19,8558b6c3450a..3e5191f9f398
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@@ -2634,7 -2634,7 +2634,7 @@@ static int ext4_writepage(struct page *
  	struct buffer_head *page_bufs = NULL;
  	struct inode *inode = page->mapping->host;
  
 -	trace_ext4_writepage(inode, page);
 +	trace_ext4_writepage(page);
  	size = i_size_read(inode);
  	if (page->index == size >> PAGE_CACHE_SHIFT)
  		len = size & ~PAGE_CACHE_MASK;
@@@ -2741,7 -2741,7 +2741,7 @@@ static int write_cache_pages_da(struct 
  	index = wbc->range_start >> PAGE_CACHE_SHIFT;
  	end = wbc->range_end >> PAGE_CACHE_SHIFT;
  
- 	if (wbc->sync_mode == WB_SYNC_ALL)
+ 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  		tag = PAGECACHE_TAG_TOWRITE;
  	else
  		tag = PAGECACHE_TAG_DIRTY;
@@@ -2973,7 -2973,7 +2973,7 @@@ static int ext4_da_writepages(struct ad
  	}
  
  retry:
- 	if (wbc->sync_mode == WB_SYNC_ALL)
+ 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  		tag_pages_for_writeback(mapping, index, end);
  
  	while (!ret && wbc->nr_to_write > 0) {
@@@ -3501,8 -3501,10 +3501,8 @@@ retry
  				 offset, nr_segs,
  				 ext4_get_block, NULL, NULL, 0);
  	else {
 -		ret = blockdev_direct_IO(rw, iocb, inode,
 -				 inode->i_sb->s_bdev, iov,
 -				 offset, nr_segs,
 -				 ext4_get_block, NULL);
 +		ret = blockdev_direct_IO(rw, iocb, inode, iov,
 +				 offset, nr_segs, ext4_get_block);
  
  		if (unlikely((rw & WRITE) && ret < 0)) {
  			loff_t isize = i_size_read(inode);
@@@ -3573,7 -3575,6 +3573,7 @@@ static void ext4_end_io_dio(struct kioc
  			    ssize_t size, void *private, int ret,
  			    bool is_async)
  {
 +	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
          ext4_io_end_t *io_end = iocb->private;
  	struct workqueue_struct *wq;
  	unsigned long flags;
@@@ -3595,7 -3596,6 +3595,7 @@@
  out:
  		if (is_async)
  			aio_complete(iocb, ret, 0);
 +		inode_dio_done(inode);
  		return;
  	}
  
@@@ -3616,9 -3616,6 +3616,9 @@@
  	/* queue the work to convert unwritten extents to written */
  	queue_work(wq, &io_end->work);
  	iocb->private = NULL;
 +
 +	/* XXX: probably should move into the real I/O completion handler */
 +	inode_dio_done(inode);
  }
  
  static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@@ -3751,13 -3748,11 +3751,13 @@@ static ssize_t ext4_ext_direct_IO(int r
  			EXT4_I(inode)->cur_aio_dio = iocb->private;
  		}
  
 -		ret = blockdev_direct_IO(rw, iocb, inode,
 +		ret = __blockdev_direct_IO(rw, iocb, inode,
  					 inode->i_sb->s_bdev, iov,
  					 offset, nr_segs,
  					 ext4_get_block_write,
 -					 ext4_end_io_dio);
 +					 ext4_end_io_dio,
 +					 NULL,
 +					 DIO_LOCKING | DIO_SKIP_HOLES);
  		if (iocb->private)
  			EXT4_I(inode)->cur_aio_dio = NULL;
  		/*
@@@ -5356,8 -5351,6 +5356,8 @@@ int ext4_setattr(struct dentry *dentry
  	}
  
  	if (attr->ia_valid & ATTR_SIZE) {
 +		inode_dio_wait(inode);
 +
  		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
  			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
  
@@@ -5850,84 -5843,80 +5850,84 @@@ int ext4_page_mkwrite(struct vm_area_st
  	struct page *page = vmf->page;
  	loff_t size;
  	unsigned long len;
 -	int ret = -EINVAL;
 -	void *fsdata;
 +	int ret;
  	struct file *file = vma->vm_file;
  	struct inode *inode = file->f_path.dentry->d_inode;
  	struct address_space *mapping = inode->i_mapping;
 +	handle_t *handle;
 +	get_block_t *get_block;
 +	int retries = 0;
  
  	/*
 -	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
 -	 * get i_mutex because we are already holding mmap_sem.
 +	 * This check is racy but catches the common case. We rely on
 +	 * __block_page_mkwrite() to do a reliable check.
  	 */
 -	down_read(&inode->i_alloc_sem);
 -	size = i_size_read(inode);
 -	if (page->mapping != mapping || size <= page_offset(page)
 -	    || !PageUptodate(page)) {
 -		/* page got truncated from under us? */
 -		goto out_unlock;
 +	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 +	/* Delalloc case is easy... */
 +	if (test_opt(inode->i_sb, DELALLOC) &&
 +	    !ext4_should_journal_data(inode) &&
 +	    !ext4_nonda_switch(inode->i_sb)) {
 +		do {
 +			ret = __block_page_mkwrite(vma, vmf,
 +						   ext4_da_get_block_prep);
 +		} while (ret == -ENOSPC &&
 +		       ext4_should_retry_alloc(inode->i_sb, &retries));
 +		goto out_ret;
  	}
 -	ret = 0;
  
  	lock_page(page);
 -	wait_on_page_writeback(page);
 -	if (PageMappedToDisk(page)) {
 -		up_read(&inode->i_alloc_sem);
 -		return VM_FAULT_LOCKED;
 +	size = i_size_read(inode);
 +	/* Page got truncated from under us? */
 +	if (page->mapping != mapping || page_offset(page) > size) {
 +		unlock_page(page);
 +		ret = VM_FAULT_NOPAGE;
 +		goto out;
  	}
  
  	if (page->index == size >> PAGE_CACHE_SHIFT)
  		len = size & ~PAGE_CACHE_MASK;
  	else
  		len = PAGE_CACHE_SIZE;
 -
  	/*
 -	 * return if we have all the buffers mapped. This avoid
 -	 * the need to call write_begin/write_end which does a
 -	 * journal_start/journal_stop which can block and take
 -	 * long time
 +	 * Return if we have all the buffers mapped. This avoids the need to do
 +	 * journal_start/journal_stop which can block and take a long time
  	 */
  	if (page_has_buffers(page)) {
  		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
  					ext4_bh_unmapped)) {
 -			up_read(&inode->i_alloc_sem);
 -			return VM_FAULT_LOCKED;
 +			/* Wait so that we don't change page under IO */
 +			wait_on_page_writeback(page);
 +			ret = VM_FAULT_LOCKED;
 +			goto out;
  		}
  	}
  	unlock_page(page);
 -	/*
 -	 * OK, we need to fill the hole... Do write_begin write_end
 -	 * to do block allocation/reservation.We are not holding
 -	 * inode.i__mutex here. That allow * parallel write_begin,
 -	 * write_end call. lock_page prevent this from happening
 -	 * on the same page though
 -	 */
 -	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
 -			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 -	if (ret < 0)
 -		goto out_unlock;
 -	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
 -			len, len, page, fsdata);
 -	if (ret < 0)
 -		goto out_unlock;
 -	ret = 0;
 -
 -	/*
 -	 * write_begin/end might have created a dirty page and someone
 -	 * could wander in and start the IO.  Make sure that hasn't
 -	 * happened.
 -	 */
 -	lock_page(page);
 -	wait_on_page_writeback(page);
 -	up_read(&inode->i_alloc_sem);
 -	return VM_FAULT_LOCKED;
 -out_unlock:
 -	if (ret)
 +	/* OK, we need to fill the hole... */
 +	if (ext4_should_dioread_nolock(inode))
 +		get_block = ext4_get_block_write;
 +	else
 +		get_block = ext4_get_block;
 +retry_alloc:
 +	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +	if (IS_ERR(handle)) {
  		ret = VM_FAULT_SIGBUS;
 -	up_read(&inode->i_alloc_sem);
 +		goto out;
 +	}
 +	ret = __block_page_mkwrite(vma, vmf, get_block);
 +	if (!ret && ext4_should_journal_data(inode)) {
 +		if (walk_page_buffers(handle, page_buffers(page), 0,
 +			  PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
 +			unlock_page(page);
 +			ret = VM_FAULT_SIGBUS;
 +			goto out;
 +		}
 +		ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 +	}
 +	ext4_journal_stop(handle);
 +	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 +		goto retry_alloc;
 +out_ret:
 +	ret = block_page_mkwrite_return(ret);
 +out:
  	return ret;
  }
diff --combined fs/fs-writeback.c
index b8c507ca42f7,6d49439ca31d..1599aa985fe2
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@@ -35,7 -35,9 +35,9 @@@
  struct wb_writeback_work {
  	long nr_pages;
  	struct super_block *sb;
+ 	unsigned long *older_than_this;
  	enum writeback_sync_modes sync_mode;
+ 	unsigned int tagged_writepages:1;
  	unsigned int for_kupdate:1;
  	unsigned int range_cyclic:1;
  	unsigned int for_background:1;
@@@ -180,12 -182,13 +182,13 @@@ void bdi_start_background_writeback(str
   */
  void inode_wb_list_del(struct inode *inode)
  {
- 	spin_lock(&inode_wb_list_lock);
+ 	struct backing_dev_info *bdi = inode_to_bdi(inode);
+ 
+ 	spin_lock(&bdi->wb.list_lock);
  	list_del_init(&inode->i_wb_list);
- 	spin_unlock(&inode_wb_list_lock);
+ 	spin_unlock(&bdi->wb.list_lock);
  }
  
- 
  /*
   * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
   * furthest end of its superblock's dirty-inode list.
@@@ -195,11 -198,9 +198,9 @@@
   * the case then the inode must have been redirtied while it was being written
   * out and we don't reset its dirtied_when.
   */
- static void redirty_tail(struct inode *inode)
+ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  {
- 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- 
- 	assert_spin_locked(&inode_wb_list_lock);
+ 	assert_spin_locked(&wb->list_lock);
  	if (!list_empty(&wb->b_dirty)) {
  		struct inode *tail;
  
@@@ -213,11 -214,9 +214,9 @@@
  /*
   * requeue inode for re-scanning after bdi->b_io list is exhausted.
   */
- static void requeue_io(struct inode *inode)
+ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
  {
- 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- 
- 	assert_spin_locked(&inode_wb_list_lock);
+ 	assert_spin_locked(&wb->list_lock);
  	list_move(&inode->i_wb_list, &wb->b_more_io);
  }
  
@@@ -225,7 -224,7 +224,7 @@@ static void inode_sync_complete(struct 
  {
  	/*
  	 * Prevent speculative execution through
- 	 * spin_unlock(&inode_wb_list_lock);
+ 	 * spin_unlock(&wb->list_lock);
  	 */
  
  	smp_mb();
@@@ -250,15 -249,16 +249,16 @@@ static bool inode_dirtied_after(struct 
  /*
   * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
   */
- static void move_expired_inodes(struct list_head *delaying_queue,
+ static int move_expired_inodes(struct list_head *delaying_queue,
  			       struct list_head *dispatch_queue,
- 				unsigned long *older_than_this)
+ 			       unsigned long *older_than_this)
  {
  	LIST_HEAD(tmp);
  	struct list_head *pos, *node;
  	struct super_block *sb = NULL;
  	struct inode *inode;
  	int do_sb_sort = 0;
+ 	int moved = 0;
  
  	while (!list_empty(delaying_queue)) {
  		inode = wb_inode(delaying_queue->prev);
@@@ -269,12 -269,13 +269,13 @@@
  			do_sb_sort = 1;
  		sb = inode->i_sb;
  		list_move(&inode->i_wb_list, &tmp);
+ 		moved++;
  	}
  
  	/* just one sb in list, splice to dispatch_queue and we're done */
  	if (!do_sb_sort) {
  		list_splice(&tmp, dispatch_queue);
- 		return;
+ 		goto out;
  	}
  
  	/* Move inodes from one superblock together */
@@@ -286,6 -287,8 +287,8 @@@
  				list_move(&inode->i_wb_list, dispatch_queue);
  		}
  	}
+ out:
+ 	return moved;
  }
  
  /*
@@@ -301,9 -304,11 +304,11 @@@
   */
  static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
  {
- 	assert_spin_locked(&inode_wb_list_lock);
+ 	int moved;
+ 	assert_spin_locked(&wb->list_lock);
  	list_splice_init(&wb->b_more_io, &wb->b_io);
- 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ 	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+ 	trace_writeback_queue_io(wb, older_than_this, moved);
  }
  
  static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@@ -316,7 -321,8 +321,8 @@@
  /*
   * Wait for writeback on an inode to complete.
   */
- static void inode_wait_for_writeback(struct inode *inode)
+ static void inode_wait_for_writeback(struct inode *inode,
+ 				     struct bdi_writeback *wb)
  {
  	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
  	wait_queue_head_t *wqh;
@@@ -324,15 -330,15 +330,15 @@@
  	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
  	while (inode->i_state & I_SYNC) {
  		spin_unlock(&inode->i_lock);
- 		spin_unlock(&inode_wb_list_lock);
+ 		spin_unlock(&wb->list_lock);
  		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
- 		spin_lock(&inode_wb_list_lock);
+ 		spin_lock(&wb->list_lock);
  		spin_lock(&inode->i_lock);
  	}
  }
  
  /*
-  * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
+  * Write out an inode's dirty pages.  Called under wb->list_lock and
   * inode->i_lock.  Either the caller has an active reference on the inode or
   * the inode has I_WILL_FREE set.
   *
@@@ -343,13 -349,15 +349,15 @@@
   * livelocks, etc.
   */
  static int
- writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+ 		       struct writeback_control *wbc)
  {
  	struct address_space *mapping = inode->i_mapping;
+ 	long nr_to_write = wbc->nr_to_write;
  	unsigned dirty;
  	int ret;
  
- 	assert_spin_locked(&inode_wb_list_lock);
+ 	assert_spin_locked(&wb->list_lock);
  	assert_spin_locked(&inode->i_lock);
  
  	if (!atomic_read(&inode->i_count))
@@@ -367,14 -375,16 +375,16 @@@
  		 * completed a full scan of b_io.
  		 */
  		if (wbc->sync_mode != WB_SYNC_ALL) {
- 			requeue_io(inode);
+ 			requeue_io(inode, wb);
+ 			trace_writeback_single_inode_requeue(inode, wbc,
+ 							     nr_to_write);
  			return 0;
  		}
  
  		/*
  		 * It's a data-integrity sync.  We must wait.
  		 */
- 		inode_wait_for_writeback(inode);
+ 		inode_wait_for_writeback(inode, wb);
  	}
  
  	BUG_ON(inode->i_state & I_SYNC);
@@@ -383,7 -393,7 +393,7 @@@
  	inode->i_state |= I_SYNC;
  	inode->i_state &= ~I_DIRTY_PAGES;
  	spin_unlock(&inode->i_lock);
- 	spin_unlock(&inode_wb_list_lock);
+ 	spin_unlock(&wb->list_lock);
  
  	ret = do_writepages(mapping, wbc);
  
@@@ -414,10 -424,19 +424,19 @@@
  			ret = err;
  	}
  
- 	spin_lock(&inode_wb_list_lock);
+ 	spin_lock(&wb->list_lock);
  	spin_lock(&inode->i_lock);
  	inode->i_state &= ~I_SYNC;
  	if (!(inode->i_state & I_FREEING)) {
+ 		/*
+ 		 * Sync livelock prevention. Each inode is tagged and synced in
+ 		 * one shot. If still dirty, it will be redirty_tail()'ed below.
+ 		 * Update the dirty time to prevent enqueue and sync it again.
+ 		 */
+ 		if ((inode->i_state & I_DIRTY) &&
+ 		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+ 			inode->dirtied_when = jiffies;
+ 
  		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
  			/*
  			 * We didn't write back all the pages.  nfs_writepages()
@@@ -428,7 -447,7 +447,7 @@@
  				/*
  				 * slice used up: queue for next turn
  				 */
- 				requeue_io(inode);
+ 				requeue_io(inode, wb);
  			} else {
  				/*
  				 * Writeback blocked by something other than
@@@ -437,7 -456,7 +456,7 @@@
  				 * retrying writeback of the dirty page/inode
  				 * that cannot be performed immediately.
  				 */
- 				redirty_tail(inode);
+ 				redirty_tail(inode, wb);
  			}
  		} else if (inode->i_state & I_DIRTY) {
  			/*
@@@ -446,7 -465,7 +465,7 @@@
  			 * submission or metadata updates after data IO
  			 * completion.
  			 */
- 			redirty_tail(inode);
+ 			redirty_tail(inode, wb);
  		} else {
  			/*
  			 * The inode is clean.  At this point we either have
@@@ -457,9 -476,67 +476,41 @@@
  		}
  	}
  	inode_sync_complete(inode);
+ 	trace_writeback_single_inode(inode, wbc, nr_to_write);
  	return ret;
  }
  
 -/*
 - * For background writeback the caller does not have the sb pinned
 - * before calling writeback. So make sure that we do pin it, so it doesn't
 - * go away while we are writing inodes from it.
 - */
 -static bool pin_sb_for_writeback(struct super_block *sb)
 -{
 -	spin_lock(&sb_lock);
 -	if (list_empty(&sb->s_instances)) {
 -		spin_unlock(&sb_lock);
 -		return false;
 -	}
 -
 -	sb->s_count++;
 -	spin_unlock(&sb_lock);
 -
 -	if (down_read_trylock(&sb->s_umount)) {
 -		if (sb->s_root)
 -			return true;
 -		up_read(&sb->s_umount);
 -	}
 -
 -	put_super(sb);
 -	return false;
 -}
 -
+ static long writeback_chunk_size(struct backing_dev_info *bdi,
+ 				 struct wb_writeback_work *work)
+ {
+ 	long pages;
+ 
+ 	/*
+ 	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+ 	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+ 	 * here avoids calling into writeback_inodes_wb() more than once.
+ 	 *
+ 	 * The intended call sequence for WB_SYNC_ALL writeback is:
+ 	 *
+ 	 *      wb_writeback()
+ 	 *          writeback_sb_inodes()       <== called only once
+ 	 *              write_cache_pages()     <== called once for each inode
+ 	 *                   (quickly) tag currently dirty pages
+ 	 *                   (maybe slowly) sync all tagged pages
+ 	 */
+ 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+ 		pages = LONG_MAX;
+ 	else {
+ 		pages = min(bdi->avg_write_bandwidth / 2,
+ 			    global_dirty_limit / DIRTY_SCOPE);
+ 		pages = min(pages, work->nr_pages);
+ 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
+ 				   MIN_WRITEBACK_PAGES);
+ 	}
+ 
+ 	return pages;
+ }
+ 
  /*
   * Write a portion of b_io inodes which belong to @sb.
   *
@@@ -467,24 -544,36 +518,36 @@@
   * inodes. Otherwise write only ones which go sequentially
   * in reverse order.
   *
-  * Return 1, if the caller writeback routine should be
-  * interrupted. Otherwise return 0.
+  * Return the number of pages and/or inodes written.
   */
- static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
- 		struct writeback_control *wbc, bool only_this_sb)
+ static long writeback_sb_inodes(struct super_block *sb,
+ 				struct bdi_writeback *wb,
+ 				struct wb_writeback_work *work)
  {
+ 	struct writeback_control wbc = {
+ 		.sync_mode		= work->sync_mode,
+ 		.tagged_writepages	= work->tagged_writepages,
+ 		.for_kupdate		= work->for_kupdate,
+ 		.for_background		= work->for_background,
+ 		.range_cyclic		= work->range_cyclic,
+ 		.range_start		= 0,
+ 		.range_end		= LLONG_MAX,
+ 	};
+ 	unsigned long start_time = jiffies;
+ 	long write_chunk;
+ 	long wrote = 0;  /* count both pages and inodes */
+ 
  	while (!list_empty(&wb->b_io)) {
- 		long pages_skipped;
  		struct inode *inode = wb_inode(wb->b_io.prev);
  
  		if (inode->i_sb != sb) {
- 			if (only_this_sb) {
+ 			if (work->sb) {
  				/*
  				 * We only want to write back data for this
  				 * superblock, move all inodes not belonging
  				 * to it back onto the dirty list.
  				 */
- 				redirty_tail(inode);
+ 				redirty_tail(inode, wb);
  				continue;
  			}
  
@@@ -493,7 -582,7 +556,7 @@@
  			 * Bounce back to the caller to unpin this and
  			 * pin the next superblock.
  			 */
- 			return 0;
+ 			break;
  		}
  
  		/*
@@@ -504,95 -593,91 +567,91 @@@
  		spin_lock(&inode->i_lock);
  		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
  			spin_unlock(&inode->i_lock);
- 			requeue_io(inode);
+ 			redirty_tail(inode, wb);
  			continue;
  		}
- 
- 		/*
- 		 * Was this inode dirtied after sync_sb_inodes was called?
- 		 * This keeps sync from extra jobs and livelock.
- 		 */
- 		if (inode_dirtied_after(inode, wbc->wb_start)) {
- 			spin_unlock(&inode->i_lock);
- 			return 1;
- 		}
- 
  		__iget(inode);
+ 		write_chunk = writeback_chunk_size(wb->bdi, work);
+ 		wbc.nr_to_write = write_chunk;
+ 		wbc.pages_skipped = 0;
  
- 		pages_skipped = wbc->pages_skipped;
- 		writeback_single_inode(inode, wbc);
- 		if (wbc->pages_skipped != pages_skipped) {
+ 		writeback_single_inode(inode, wb, &wbc);
+ 
+ 		work->nr_pages -= write_chunk - wbc.nr_to_write;
+ 		wrote += write_chunk - wbc.nr_to_write;
+ 		if (!(inode->i_state & I_DIRTY))
+ 			wrote++;
+ 		if (wbc.pages_skipped) {
  			/*
  			 * writeback is not making progress due to locked
  			 * buffers.  Skip this inode for now.
  			 */
- 			redirty_tail(inode);
+ 			redirty_tail(inode, wb);
  		}
  		spin_unlock(&inode->i_lock);
- 		spin_unlock(&inode_wb_list_lock);
+ 		spin_unlock(&wb->list_lock);
  		iput(inode);
  		cond_resched();
- 		spin_lock(&inode_wb_list_lock);
- 		if (wbc->nr_to_write <= 0) {
- 			wbc->more_io = 1;
- 			return 1;
+ 		spin_lock(&wb->list_lock);
+ 		/*
+ 		 * bail out to wb_writeback() often enough to check
+ 		 * background threshold and other termination conditions.
+ 		 */
+ 		if (wrote) {
+ 			if (time_is_before_jiffies(start_time + HZ / 10UL))
+ 				break;
+ 			if (work->nr_pages <= 0)
+ 				break;
  		}
- 		if (!list_empty(&wb->b_more_io))
- 			wbc->more_io = 1;
  	}
- 	/* b_io is empty */
- 	return 1;
+ 	return wrote;
  }
  
- void writeback_inodes_wb(struct bdi_writeback *wb,
- 		struct writeback_control *wbc)
+ static long __writeback_inodes_wb(struct bdi_writeback *wb,
+ 				  struct wb_writeback_work *work)
  {
- 	int ret = 0;
- 
- 	if (!wbc->wb_start)
- 		wbc->wb_start = jiffies; /* livelock avoidance */
- 	spin_lock(&inode_wb_list_lock);
- 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
- 		queue_io(wb, wbc->older_than_this);
+ 	unsigned long start_time = jiffies;
+ 	long wrote = 0;
  
  	while (!list_empty(&wb->b_io)) {
  		struct inode *inode = wb_inode(wb->b_io.prev);
  		struct super_block *sb = inode->i_sb;
  
 -		if (!pin_sb_for_writeback(sb)) {
 +		if (!grab_super_passive(sb)) {
- 			requeue_io(inode);
+ 			requeue_io(inode, wb);
  			continue;
  		}
- 		ret = writeback_sb_inodes(sb, wb, wbc, false);
+ 		wrote += writeback_sb_inodes(sb, wb, work);
  		drop_super(sb);
  
- 		if (ret)
- 			break;
+ 		/* refer to the same tests at the end of writeback_sb_inodes */
+ 		if (wrote) {
+ 			if (time_is_before_jiffies(start_time + HZ / 10UL))
+ 				break;
+ 			if (work->nr_pages <= 0)
+ 				break;
+ 		}
  	}
- 	spin_unlock(&inode_wb_list_lock);
  	/* Leave any unwritten inodes on b_io */
+ 	return wrote;
  }
  
- static void __writeback_inodes_sb(struct super_block *sb,
- 		struct bdi_writeback *wb, struct writeback_control *wbc)
+ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
  {
- 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ 	struct wb_writeback_work work = {
+ 		.nr_pages	= nr_pages,
+ 		.sync_mode	= WB_SYNC_NONE,
+ 		.range_cyclic	= 1,
+ 	};
  
- 	spin_lock(&inode_wb_list_lock);
- 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
- 		queue_io(wb, wbc->older_than_this);
- 	writeback_sb_inodes(sb, wb, wbc, true);
- 	spin_unlock(&inode_wb_list_lock);
- }
+ 	spin_lock(&wb->list_lock);
+ 	if (list_empty(&wb->b_io))
+ 		queue_io(wb, NULL);
+ 	__writeback_inodes_wb(wb, &work);
+ 	spin_unlock(&wb->list_lock);
  
- /*
-  * The maximum number of pages to writeout in a single bdi flush/kupdate
-  * operation.  We do this so we don't hold I_SYNC against an inode for
-  * enormous amounts of time, which would block a userspace task which has
-  * been forced to throttle against that inode.  Also, the code reevaluates
-  * the dirty each time it has written this many pages.
-  */
- #define MAX_WRITEBACK_PAGES     1024
+ 	return nr_pages - work.nr_pages;
+ }
  
  static inline bool over_bground_thresh(void)
  {
@@@ -604,6 -689,16 +663,16 @@@
  		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
  }
  
+ /*
+  * Called under wb->list_lock. If there are multiple wb per bdi,
+  * only the flusher working on the first wb should do it.
+  */
+ static void wb_update_bandwidth(struct bdi_writeback *wb,
+ 				unsigned long start_time)
+ {
+ 	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
+ }
+ 
  /*
   * Explicit flushing or periodic writeback of "old" data.
   *
@@@ -622,47 -717,16 +691,16 @@@
  static long wb_writeback(struct bdi_writeback *wb,
  			 struct wb_writeback_work *work)
  {
- 	struct writeback_control wbc = {
- 		.sync_mode		= work->sync_mode,
- 		.older_than_this	= NULL,
- 		.for_kupdate		= work->for_kupdate,
- 		.for_background		= work->for_background,
- 		.range_cyclic		= work->range_cyclic,
- 	};
+ 	unsigned long wb_start = jiffies;
+ 	long nr_pages = work->nr_pages;
  	unsigned long oldest_jif;
- 	long wrote = 0;
- 	long write_chunk;
  	struct inode *inode;
+ 	long progress;
  
- 	if (wbc.for_kupdate) {
- 		wbc.older_than_this = &oldest_jif;
- 		oldest_jif = jiffies -
- 				msecs_to_jiffies(dirty_expire_interval * 10);
- 	}
- 	if (!wbc.range_cyclic) {
- 		wbc.range_start = 0;
- 		wbc.range_end = LLONG_MAX;
- 	}
+ 	oldest_jif = jiffies;
+ 	work->older_than_this = &oldest_jif;
  
- 	/*
- 	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
- 	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
- 	 * here avoids calling into writeback_inodes_wb() more than once.
- 	 *
- 	 * The intended call sequence for WB_SYNC_ALL writeback is:
- 	 *
- 	 *      wb_writeback()
- 	 *          __writeback_inodes_sb()     <== called only once
- 	 *              write_cache_pages()     <== called once for each inode
- 	 *                   (quickly) tag currently dirty pages
- 	 *                   (maybe slowly) sync all tagged pages
- 	 */
- 	if (wbc.sync_mode == WB_SYNC_NONE)
- 		write_chunk = MAX_WRITEBACK_PAGES;
- 	else
- 		write_chunk = LONG_MAX;
- 
- 	wbc.wb_start = jiffies; /* livelock avoidance */
+ 	spin_lock(&wb->list_lock);
  	for (;;) {
  		/*
  		 * Stop writeback when nr_pages has been consumed
@@@ -687,52 -751,54 +725,54 @@@
  		if (work->for_background && !over_bground_thresh())
  			break;
  
- 		wbc.more_io = 0;
- 		wbc.nr_to_write = write_chunk;
- 		wbc.pages_skipped = 0;
+ 		if (work->for_kupdate) {
+ 			oldest_jif = jiffies -
+ 				msecs_to_jiffies(dirty_expire_interval * 10);
+ 			work->older_than_this = &oldest_jif;
+ 		}
  
- 		trace_wbc_writeback_start(&wbc, wb->bdi);
+ 		trace_writeback_start(wb->bdi, work);
+ 		if (list_empty(&wb->b_io))
+ 			queue_io(wb, work->older_than_this);
  		if (work->sb)
- 			__writeback_inodes_sb(work->sb, wb, &wbc);
+ 			progress = writeback_sb_inodes(work->sb, wb, work);
  		else
- 			writeback_inodes_wb(wb, &wbc);
- 		trace_wbc_writeback_written(&wbc, wb->bdi);
+ 			progress = __writeback_inodes_wb(wb, work);
+ 		trace_writeback_written(wb->bdi, work);
  
- 		work->nr_pages -= write_chunk - wbc.nr_to_write;
- 		wrote += write_chunk - wbc.nr_to_write;
+ 		wb_update_bandwidth(wb, wb_start);
  
  		/*
- 		 * If we consumed everything, see if we have more
+ 		 * Did we write something? Try for more
+ 		 *
+ 		 * Dirty inodes are moved to b_io for writeback in batches.
+ 		 * The completion of the current batch does not necessarily
+ 		 * mean the overall work is done. So we keep looping as long
+ 		 * as we made some progress on cleaning pages or inodes.
  		 */
- 		if (wbc.nr_to_write <= 0)
+ 		if (progress)
  			continue;
  		/*
- 		 * Didn't write everything and we don't have more IO, bail
+ 		 * No more inodes for IO, bail
  		 */
- 		if (!wbc.more_io)
+ 		if (list_empty(&wb->b_more_io))
  			break;
- 		/*
- 		 * Did we write something? Try for more
- 		 */
- 		if (wbc.nr_to_write < write_chunk)
- 			continue;
  		/*
  		 * Nothing written. Wait for some inode to
  		 * become available for writeback. Otherwise
  		 * we'll just busyloop.
  		 */
- 		spin_lock(&inode_wb_list_lock);
  		if (!list_empty(&wb->b_more_io))  {
+ 			trace_writeback_wait(wb->bdi, work);
  			inode = wb_inode(wb->b_more_io.prev);
- 			trace_wbc_writeback_wait(&wbc, wb->bdi);
  			spin_lock(&inode->i_lock);
- 			inode_wait_for_writeback(inode);
+ 			inode_wait_for_writeback(inode, wb);
  			spin_unlock(&inode->i_lock);
  		}
- 		spin_unlock(&inode_wb_list_lock);
  	}
+ 	spin_unlock(&wb->list_lock);
  
- 	return wrote;
+ 	return nr_pages - work->nr_pages;
  }
  
  /*
@@@ -1063,10 -1129,10 +1103,10 @@@ void __mark_inode_dirty(struct inode *i
  			}
  
  			spin_unlock(&inode->i_lock);
- 			spin_lock(&inode_wb_list_lock);
+ 			spin_lock(&bdi->wb.list_lock);
  			inode->dirtied_when = jiffies;
  			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
- 			spin_unlock(&inode_wb_list_lock);
+ 			spin_unlock(&bdi->wb.list_lock);
  
  			if (wakeup_bdi)
  				bdi_wakeup_thread_delayed(bdi);
@@@ -1162,10 -1228,11 +1202,11 @@@ void writeback_inodes_sb_nr(struct supe
  {
  	DECLARE_COMPLETION_ONSTACK(done);
  	struct wb_writeback_work work = {
- 		.sb		= sb,
- 		.sync_mode	= WB_SYNC_NONE,
- 		.done		= &done,
- 		.nr_pages	= nr,
+ 		.sb			= sb,
+ 		.sync_mode		= WB_SYNC_NONE,
+ 		.tagged_writepages	= 1,
+ 		.done			= &done,
+ 		.nr_pages		= nr,
  	};
  
  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@@ -1267,6 -1334,7 +1308,7 @@@ EXPORT_SYMBOL(sync_inodes_sb)
   */
  int write_inode_now(struct inode *inode, int sync)
  {
+ 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
  	int ret;
  	struct writeback_control wbc = {
  		.nr_to_write = LONG_MAX,
@@@ -1279,11 -1347,11 +1321,11 @@@
  		wbc.nr_to_write = 0;
  
  	might_sleep();
- 	spin_lock(&inode_wb_list_lock);
+ 	spin_lock(&wb->list_lock);
  	spin_lock(&inode->i_lock);
- 	ret = writeback_single_inode(inode, &wbc);
+ 	ret = writeback_single_inode(inode, wb, &wbc);
  	spin_unlock(&inode->i_lock);
- 	spin_unlock(&inode_wb_list_lock);
+ 	spin_unlock(&wb->list_lock);
  	if (sync)
  		inode_sync_wait(inode);
  	return ret;
@@@ -1303,13 -1371,14 +1345,14 @@@ EXPORT_SYMBOL(write_inode_now)
   */
  int sync_inode(struct inode *inode, struct writeback_control *wbc)
  {
+ 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
  	int ret;
  
- 	spin_lock(&inode_wb_list_lock);
+ 	spin_lock(&wb->list_lock);
  	spin_lock(&inode->i_lock);
- 	ret = writeback_single_inode(inode, wbc);
+ 	ret = writeback_single_inode(inode, wb, wbc);
  	spin_unlock(&inode->i_lock);
- 	spin_unlock(&inode_wb_list_lock);
+ 	spin_unlock(&wb->list_lock);
  	return ret;
  }
  EXPORT_SYMBOL(sync_inode);
diff --combined fs/inode.c
index 96c77b81167c,4be128cbc754..a48fa5355fb4
--- a/fs/inode.c
+++ b/fs/inode.c
@@@ -33,11 -33,11 +33,11 @@@
   *
   * inode->i_lock protects:
   *   inode->i_state, inode->i_hash, __iget()
 - * inode_lru_lock protects:
 - *   inode_lru, inode->i_lru
 + * inode->i_sb->s_inode_lru_lock protects:
 + *   inode->i_sb->s_inode_lru, inode->i_lru
   * inode_sb_list_lock protects:
   *   sb->s_inodes, inode->i_sb_list
-  * inode_wb_list_lock protects:
+  * bdi->wb.list_lock protects:
   *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
   * inode_hash_lock protects:
   *   inode_hashtable, inode->i_hash
@@@ -46,9 -46,9 +46,9 @@@
   *
   * inode_sb_list_lock
   *   inode->i_lock
 - *     inode_lru_lock
 + *     inode->i_sb->s_inode_lru_lock
   *
-  * inode_wb_list_lock
+  * bdi->wb.list_lock
   *   inode->i_lock
   *
   * inode_hash_lock
@@@ -64,9 -64,22 +64,8 @@@ static unsigned int i_hash_shift __read
  static struct hlist_head *inode_hashtable __read_mostly;
  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
  
 -static LIST_HEAD(inode_lru);
 -static DEFINE_SPINLOCK(inode_lru_lock);
 -
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
  
 -/*
 - * iprune_sem provides exclusion between the icache shrinking and the
 - * umount path.
 - *
 - * We don't actually need it to protect anything in the umount path,
 - * but only need to cycle through it to make sure any inode that
 - * prune_icache took off the LRU list has been fully torn down by the
 - * time we are past evict_inodes.
 - */
 -static DECLARE_RWSEM(iprune_sem);
 -
  /*
   * Empty aops. Can be used for the cases where the user does not
   * define any of the address_space operations.
@@@ -81,7 -94,6 +80,7 @@@ EXPORT_SYMBOL(empty_aops)
  struct inodes_stat_t inodes_stat;
  
  static DEFINE_PER_CPU(unsigned int, nr_inodes);
 +static DEFINE_PER_CPU(unsigned int, nr_unused);
  
  static struct kmem_cache *inode_cachep __read_mostly;
  
@@@ -96,11 -108,7 +95,11 @@@ static int get_nr_inodes(void
  
  static inline int get_nr_inodes_unused(void)
  {
 -	return inodes_stat.nr_unused;
 +	int i;
 +	int sum = 0;
 +	for_each_possible_cpu(i)
 +		sum += per_cpu(nr_unused, i);
 +	return sum < 0 ? 0 : sum;
  }
  
  int get_nr_dirty_inodes(void)
@@@ -118,7 -126,6 +117,7 @@@ int proc_nr_inodes(ctl_table *table, in
  		   void __user *buffer, size_t *lenp, loff_t *ppos)
  {
  	inodes_stat.nr_inodes = get_nr_inodes();
 +	inodes_stat.nr_unused = get_nr_inodes_unused();
  	return proc_dointvec(table, write, buffer, lenp, ppos);
  }
  #endif
@@@ -168,7 -175,8 +167,7 @@@ int inode_init_always(struct super_bloc
  	mutex_init(&inode->i_mutex);
  	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
  
 -	init_rwsem(&inode->i_alloc_sem);
 -	lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
 +	atomic_set(&inode->i_dio_count, 0);
  
  	mapping->a_ops = &empty_aops;
  	mapping->host = inode;
@@@ -328,24 -336,22 +327,24 @@@ EXPORT_SYMBOL(ihold)
  
  static void inode_lru_list_add(struct inode *inode)
  {
 -	spin_lock(&inode_lru_lock);
 +	spin_lock(&inode->i_sb->s_inode_lru_lock);
  	if (list_empty(&inode->i_lru)) {
 -		list_add(&inode->i_lru, &inode_lru);
 -		inodes_stat.nr_unused++;
 +		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
 +		inode->i_sb->s_nr_inodes_unused++;
 +		this_cpu_inc(nr_unused);
  	}
 -	spin_unlock(&inode_lru_lock);
 +	spin_unlock(&inode->i_sb->s_inode_lru_lock);
  }
  
  static void inode_lru_list_del(struct inode *inode)
  {
 -	spin_lock(&inode_lru_lock);
 +	spin_lock(&inode->i_sb->s_inode_lru_lock);
  	if (!list_empty(&inode->i_lru)) {
  		list_del_init(&inode->i_lru);
 -		inodes_stat.nr_unused--;
 +		inode->i_sb->s_nr_inodes_unused--;
 +		this_cpu_dec(nr_unused);
  	}
 -	spin_unlock(&inode_lru_lock);
 +	spin_unlock(&inode->i_sb->s_inode_lru_lock);
  }
  
  /**
@@@ -416,14 -422,7 +415,14 @@@ EXPORT_SYMBOL(remove_inode_hash)
  void end_writeback(struct inode *inode)
  {
  	might_sleep();
 +	/*
 +	 * We have to cycle tree_lock here because reclaim can still be in the
 +	 * process of removing the last page (in __delete_from_page_cache())
 +	 * and we must not free mapping under it.
 +	 */
 +	spin_lock_irq(&inode->i_data.tree_lock);
  	BUG_ON(inode->i_data.nrpages);
 +	spin_unlock_irq(&inode->i_data.tree_lock);
  	BUG_ON(!list_empty(&inode->i_data.private_list));
  	BUG_ON(!(inode->i_state & I_FREEING));
  	BUG_ON(inode->i_state & I_CLEAR);
@@@ -530,6 -529,14 +529,6 @@@ void evict_inodes(struct super_block *s
  	spin_unlock(&inode_sb_list_lock);
  
  	dispose_list(&dispose);
 -
 -	/*
 -	 * Cycle through iprune_sem to make sure any inode that prune_icache
 -	 * moved off the list before we took the lock has been fully torn
 -	 * down.
 -	 */
 -	down_write(&iprune_sem);
 -	up_write(&iprune_sem);
  }
  
  /**
@@@ -592,10 -599,8 +591,10 @@@ static int can_unuse(struct inode *inod
  }
  
  /*
 - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
 - * temporary list and then are freed outside inode_lru_lock by dispose_list().
 + * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 + * This is called from the superblock shrinker function with a number of inodes
 + * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 + * then are freed outside sb->s_inode_lru_lock by dispose_list().
   *
   * Any inodes which are pinned purely because of attached pagecache have their
   * pagecache removed.  If the inode has metadata buffers attached to
@@@ -609,28 -614,29 +608,28 @@@
   * LRU does not have strict ordering. Hence we don't want to reclaim inodes
   * with this flag set because they are the inodes that are out of order.
   */
 -static void prune_icache(int nr_to_scan)
 +void prune_icache_sb(struct super_block *sb, int nr_to_scan)
  {
  	LIST_HEAD(freeable);
  	int nr_scanned;
  	unsigned long reap = 0;
  
 -	down_read(&iprune_sem);
 -	spin_lock(&inode_lru_lock);
 -	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 +	spin_lock(&sb->s_inode_lru_lock);
 +	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
  		struct inode *inode;
  
 -		if (list_empty(&inode_lru))
 +		if (list_empty(&sb->s_inode_lru))
  			break;
  
 -		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 +		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
  
  		/*
 -		 * we are inverting the inode_lru_lock/inode->i_lock here,
 +		 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
  		 * so use a trylock. If we fail to get the lock, just move the
  		 * inode to the back of the list so we don't spin on it.
  		 */
  		if (!spin_trylock(&inode->i_lock)) {
 -			list_move(&inode->i_lru, &inode_lru);
 +			list_move(&inode->i_lru, &sb->s_inode_lru);
  			continue;
  		}
  
@@@ -642,29 -648,28 +641,29 @@@
  		    (inode->i_state & ~I_REFERENCED)) {
  			list_del_init(&inode->i_lru);
  			spin_unlock(&inode->i_lock);
 -			inodes_stat.nr_unused--;
 +			sb->s_nr_inodes_unused--;
 +			this_cpu_dec(nr_unused);
  			continue;
  		}
  
  		/* recently referenced inodes get one more pass */
  		if (inode->i_state & I_REFERENCED) {
  			inode->i_state &= ~I_REFERENCED;
 -			list_move(&inode->i_lru, &inode_lru);
 +			list_move(&inode->i_lru, &sb->s_inode_lru);
  			spin_unlock(&inode->i_lock);
  			continue;
  		}
  		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
  			__iget(inode);
  			spin_unlock(&inode->i_lock);
 -			spin_unlock(&inode_lru_lock);
 +			spin_unlock(&sb->s_inode_lru_lock);
  			if (remove_inode_buffers(inode))
  				reap += invalidate_mapping_pages(&inode->i_data,
  								0, -1);
  			iput(inode);
 -			spin_lock(&inode_lru_lock);
 +			spin_lock(&sb->s_inode_lru_lock);
  
 -			if (inode != list_entry(inode_lru.next,
 +			if (inode != list_entry(sb->s_inode_lru.next,
  						struct inode, i_lru))
  				continue;	/* wrong inode or list_empty */
  			/* avoid lock inversions with trylock */
@@@ -680,18 -685,51 +679,18 @@@
  		spin_unlock(&inode->i_lock);
  
  		list_move(&inode->i_lru, &freeable);
 -		inodes_stat.nr_unused--;
 +		sb->s_nr_inodes_unused--;
 +		this_cpu_dec(nr_unused);
  	}
  	if (current_is_kswapd())
  		__count_vm_events(KSWAPD_INODESTEAL, reap);
  	else
  		__count_vm_events(PGINODESTEAL, reap);
 -	spin_unlock(&inode_lru_lock);
 +	spin_unlock(&sb->s_inode_lru_lock);
  
  	dispose_list(&freeable);
 -	up_read(&iprune_sem);
  }
  
 -/*
 - * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
 - * "unused" means that no dentries are referring to the inodes: the files are
 - * not open and the dcache references to those inodes have already been
 - * reclaimed.
 - *
 - * This function is passed the number of inodes to scan, and it returns the
 - * total number of remaining possibly-reclaimable inodes.
 - */
 -static int shrink_icache_memory(struct shrinker *shrink,
 -				struct shrink_control *sc)
 -{
 -	int nr = sc->nr_to_scan;
 -	gfp_t gfp_mask = sc->gfp_mask;
 -
 -	if (nr) {
 -		/*
 -		 * Nasty deadlock avoidance.  We may hold various FS locks,
 -		 * and we don't want to recurse into the FS that called us
 -		 * in clear_inode() and friends..
 -		 */
 -		if (!(gfp_mask & __GFP_FS))
 -			return -1;
 -		prune_icache(nr);
 -	}
 -	return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
 -}
 -
 -static struct shrinker icache_shrinker = {
 -	.shrink = shrink_icache_memory,
 -	.seeks = DEFAULT_SEEKS,
 -};
 -
  static void __wait_on_freeing_inode(struct inode *inode);
  /*
   * Called with the inode lock held.
@@@ -1285,7 -1323,7 +1284,7 @@@ static void iput_final(struct inode *in
  
  	WARN_ON(inode->i_state & I_NEW);
  
 -	if (op && op->drop_inode)
 +	if (op->drop_inode)
  		drop = op->drop_inode(inode);
  	else
  		drop = generic_drop_inode(inode);
@@@ -1571,6 -1609,7 +1570,6 @@@ void __init inode_init(void
  					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
  					 SLAB_MEM_SPREAD),
  					 init_once);
 -	register_shrinker(&icache_shrinker);
  
  	/* Hash may have been set up in inode_init_early */
  	if (!hashdist)
diff --combined fs/nfs/write.c
index 08579312c57b,dd6a6cee39a7..00e37501fa3b
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@@ -409,7 -409,7 +409,7 @@@ out
   */
  static void nfs_inode_remove_request(struct nfs_page *req)
  {
 -	struct inode *inode = req->wb_context->path.dentry->d_inode;
 +	struct inode *inode = req->wb_context->dentry->d_inode;
  	struct nfs_inode *nfsi = NFS_I(inode);
  
  	BUG_ON (!NFS_WBACK_BUSY(req));
@@@ -438,7 -438,7 +438,7 @@@ nfs_mark_request_dirty(struct nfs_page 
  static void
  nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
  {
 -	struct inode *inode = req->wb_context->path.dentry->d_inode;
 +	struct inode *inode = req->wb_context->dentry->d_inode;
  	struct nfs_inode *nfsi = NFS_I(inode);
  
  	spin_lock(&inode->i_lock);
@@@ -852,20 -852,18 +852,20 @@@ static int nfs_write_rpcsetup(struct nf
  		struct pnfs_layout_segment *lseg,
  		int how)
  {
 -	struct inode *inode = req->wb_context->path.dentry->d_inode;
 +	struct inode *inode = req->wb_context->dentry->d_inode;
  
  	/* Set up the RPC argument and reply structs
  	 * NB: take care not to mess about with data->commit et al. */
  
  	data->req = req;
 -	data->inode = inode = req->wb_context->path.dentry->d_inode;
 +	data->inode = inode = req->wb_context->dentry->d_inode;
  	data->cred = req->wb_context->cred;
  	data->lseg = get_lseg(lseg);
  
  	data->args.fh     = NFS_FH(inode);
  	data->args.offset = req_offset(req) + offset;
 +	/* pnfs_set_layoutcommit needs this */
 +	data->mds_offset = data->args.offset;
  	data->args.pgbase = req->wb_pgbase + offset;
  	data->args.pages  = data->pagevec;
  	data->args.count  = count;
@@@ -1053,9 -1051,9 +1053,9 @@@ static void nfs_writeback_done_partial(
  
  	dprintk("NFS: %5u write(%s/%lld %d@%lld)",
  		task->tk_pid,
 -		data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
 +		data->req->wb_context->dentry->d_inode->i_sb->s_id,
  		(long long)
 -		  NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
 +		  NFS_FILEID(data->req->wb_context->dentry->d_inode),
  		data->req->wb_bytes, (long long)req_offset(data->req));
  
  	nfs_writeback_done(task, data);
@@@ -1148,8 -1146,8 +1148,8 @@@ static void nfs_writeback_release_full(
  
  		dprintk("NFS: %5u write (%s/%lld %d@%lld)",
  			data->task.tk_pid,
 -			req->wb_context->path.dentry->d_inode->i_sb->s_id,
 -			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 +			req->wb_context->dentry->d_inode->i_sb->s_id,
 +			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
  			req->wb_bytes,
  			(long long)req_offset(req));
  
@@@ -1347,7 -1345,7 +1347,7 @@@ void nfs_init_commit(struct nfs_write_d
  			    struct pnfs_layout_segment *lseg)
  {
  	struct nfs_page *first = nfs_list_entry(head->next);
 -	struct inode *inode = first->wb_context->path.dentry->d_inode;
 +	struct inode *inode = first->wb_context->dentry->d_inode;
  
  	/* Set up the RPC argument and reply structs
  	 * NB: take care not to mess about with data->commit et al. */
@@@ -1435,8 -1433,8 +1435,8 @@@ void nfs_commit_release_pages(struct nf
  		nfs_clear_request_commit(req);
  
  		dprintk("NFS:       commit (%s/%lld %d@%lld)",
 -			req->wb_context->path.dentry->d_inode->i_sb->s_id,
 -			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 +			req->wb_context->dentry->d_sb->s_id,
 +			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
  			req->wb_bytes,
  			(long long)req_offset(req));
  		if (status < 0) {
@@@ -1566,8 -1564,7 +1566,7 @@@ int nfs_write_inode(struct inode *inode
  		int status;
  		bool sync = true;
  
- 		if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
- 		    wbc->for_background)
+ 		if (wbc->sync_mode == WB_SYNC_NONE)
  			sync = false;
  
  		status = pnfs_layoutcommit_inode(inode, sync);
diff --combined include/trace/events/ext4.h
index 5ce2b2f5f524,b225d0d8c87f..6363193a3418
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@@ -26,7 -26,7 +26,7 @@@ TRACE_EVENT(ext4_free_inode
  		__field(	umode_t, mode			)
  		__field(	uid_t,	uid			)
  		__field(	gid_t,	gid			)
 -		__field(	blkcnt_t, blocks		)
 +		__field(	__u64, blocks			)
  	),
  
  	TP_fast_assign(
@@@ -40,8 -40,9 +40,8 @@@
  
  	TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
 -		  (unsigned long) __entry->ino,
 -		  __entry->mode, __entry->uid, __entry->gid,
 -		  (unsigned long long) __entry->blocks)
 +		  (unsigned long) __entry->ino, __entry->mode,
 +		  __entry->uid, __entry->gid, __entry->blocks)
  );
  
  TRACE_EVENT(ext4_request_inode,
@@@ -177,7 -178,7 +177,7 @@@ TRACE_EVENT(ext4_begin_ordered_truncate
  	TP_printk("dev %d,%d ino %lu new_size %lld",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (long long) __entry->new_size)
 +		  __entry->new_size)
  );
  
  DECLARE_EVENT_CLASS(ext4__write_begin,
@@@ -203,7 -204,7 +203,7 @@@
  		__entry->flags	= flags;
  	),
  
 -	TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
 +	TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
  		  __entry->pos, __entry->len, __entry->flags)
@@@ -247,7 -248,7 +247,7 @@@ DECLARE_EVENT_CLASS(ext4__write_end
  		__entry->copied	= copied;
  	),
  
 -	TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
 +	TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
  		  __entry->pos, __entry->len, __entry->copied)
@@@ -285,6 -286,29 +285,6 @@@ DEFINE_EVENT(ext4__write_end, ext4_da_w
  	TP_ARGS(inode, pos, len, copied)
  );
  
 -TRACE_EVENT(ext4_writepage,
 -	TP_PROTO(struct inode *inode, struct page *page),
 -
 -	TP_ARGS(inode, page),
 -
 -	TP_STRUCT__entry(
 -		__field(	dev_t,	dev			)
 -		__field(	ino_t,	ino			)
 -		__field(	pgoff_t, index			)
 -
 -	),
 -
 -	TP_fast_assign(
 -		__entry->dev	= inode->i_sb->s_dev;
 -		__entry->ino	= inode->i_ino;
 -		__entry->index	= page->index;
 -	),
 -
 -	TP_printk("dev %d,%d ino %lu page_index %lu",
 -		  MAJOR(__entry->dev), MINOR(__entry->dev),
 -		  (unsigned long) __entry->ino, __entry->index)
 -);
 -
  TRACE_EVENT(ext4_da_writepages,
  	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
  
@@@ -317,7 -341,7 +317,7 @@@
  	),
  
  	TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
 -		  "range_start %llu range_end %llu sync_mode %d"
 +		  "range_start %lld range_end %lld sync_mode %d"
  		  "for_kupdate %d range_cyclic %d writeback_index %lu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino, __entry->nr_to_write,
@@@ -380,7 -404,6 +380,6 @@@ TRACE_EVENT(ext4_da_writepages_result
  		__field(	int,	pages_written		)
  		__field(	long,	pages_skipped		)
  		__field(	int,	sync_mode		)
- 		__field(	char,	more_io			)	
  		__field(       pgoff_t,	writeback_index		)
  	),
  
@@@ -391,16 -414,15 +390,15 @@@
  		__entry->pages_written	= pages_written;
  		__entry->pages_skipped	= wbc->pages_skipped;
  		__entry->sync_mode	= wbc->sync_mode;
- 		__entry->more_io	= wbc->more_io;
  		__entry->writeback_index = inode->i_mapping->writeback_index;
  	),
  
  	TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
- 		  " more_io %d sync_mode %d writeback_index %lu",
+ 		  "sync_mode %d writeback_index %lu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino, __entry->ret,
  		  __entry->pages_written, __entry->pages_skipped,
- 		  __entry->more_io, __entry->sync_mode,
+ 		  __entry->sync_mode,
  		  (unsigned long) __entry->writeback_index)
  );
  
@@@ -425,14 -447,7 +423,14 @@@ DECLARE_EVENT_CLASS(ext4__page_op
  	TP_printk("dev %d,%d ino %lu page_index %lu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  __entry->index)
 +		  (unsigned long) __entry->index)
 +);
 +
 +DEFINE_EVENT(ext4__page_op, ext4_writepage,
 +
 +	TP_PROTO(struct page *page),
 +
 +	TP_ARGS(page)
  );
  
  DEFINE_EVENT(ext4__page_op, ext4_readpage,
@@@ -472,7 -487,7 +470,7 @@@ TRACE_EVENT(ext4_invalidatepage
  	TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  __entry->index, __entry->offset)
 +		  (unsigned long) __entry->index, __entry->offset)
  );
  
  TRACE_EVENT(ext4_discard_blocks,
@@@ -545,10 -560,12 +543,10 @@@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_n
  );
  
  TRACE_EVENT(ext4_mb_release_inode_pa,
 -	TP_PROTO(struct super_block *sb,
 -		 struct inode *inode,
 -		 struct ext4_prealloc_space *pa,
 +	TP_PROTO(struct ext4_prealloc_space *pa,
  		 unsigned long long block, unsigned int count),
  
 -	TP_ARGS(sb, inode, pa, block, count),
 +	TP_ARGS(pa, block, count),
  
  	TP_STRUCT__entry(
  		__field(	dev_t,	dev			)
@@@ -559,8 -576,8 +557,8 @@@
  	),
  
  	TP_fast_assign(
 -		__entry->dev		= sb->s_dev;
 -		__entry->ino		= inode->i_ino;
 +		__entry->dev		= pa->pa_inode->i_sb->s_dev;
 +		__entry->ino		= pa->pa_inode->i_ino;
  		__entry->block		= block;
  		__entry->count		= count;
  	),
@@@ -572,9 -589,10 +570,9 @@@
  );
  
  TRACE_EVENT(ext4_mb_release_group_pa,
 -	TP_PROTO(struct super_block *sb,
 -		 struct ext4_prealloc_space *pa),
 +	TP_PROTO(struct ext4_prealloc_space *pa),
  
 -	TP_ARGS(sb, pa),
 +	TP_ARGS(pa),
  
  	TP_STRUCT__entry(
  		__field(	dev_t,	dev			)
@@@ -584,7 -602,7 +582,7 @@@
  	),
  
  	TP_fast_assign(
 -		__entry->dev		= sb->s_dev;
 +		__entry->dev		= pa->pa_inode->i_sb->s_dev;
  		__entry->pa_pstart	= pa->pa_pstart;
  		__entry->pa_len		= pa->pa_len;
  	),
@@@ -646,10 -664,10 +644,10 @@@ TRACE_EVENT(ext4_request_blocks
  		__field(	ino_t,	ino			)
  		__field(	unsigned int, flags		)
  		__field(	unsigned int, len		)
 -		__field(	__u64,  logical			)
 +		__field(	__u32,  logical			)
 +		__field(	__u32,	lleft			)
 +		__field(	__u32,	lright			)
  		__field(	__u64,	goal			)
 -		__field(	__u64,	lleft			)
 -		__field(	__u64,	lright			)
  		__field(	__u64,	pleft			)
  		__field(	__u64,	pright			)
  	),
@@@ -667,13 -685,17 +665,13 @@@
  		__entry->pright	= ar->pright;
  	),
  
 -	TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu "
 -		  "lleft %llu lright %llu pleft %llu pright %llu ",
 +	TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
 +		  "lleft %u lright %u pleft %llu pright %llu ",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
 -		  (unsigned long) __entry->ino,
 -		  __entry->flags, __entry->len,
 -		  (unsigned long long) __entry->logical,
 -		  (unsigned long long) __entry->goal,
 -		  (unsigned long long) __entry->lleft,
 -		  (unsigned long long) __entry->lright,
 -		  (unsigned long long) __entry->pleft,
 -		  (unsigned long long) __entry->pright)
 +		  (unsigned long) __entry->ino, __entry->flags,
 +		  __entry->len, __entry->logical, __entry->goal,
 +		  __entry->lleft, __entry->lright, __entry->pleft,
 +		  __entry->pright)
  );
  
  TRACE_EVENT(ext4_allocate_blocks,
@@@ -687,10 -709,10 +685,10 @@@
  		__field(	__u64,	block			)
  		__field(	unsigned int, flags		)
  		__field(	unsigned int, len		)
 -		__field(	__u64,  logical			)
 +		__field(	__u32,  logical			)
 +		__field(	__u32,	lleft			)
 +		__field(	__u32,	lright			)
  		__field(	__u64,	goal			)
 -		__field(	__u64,	lleft			)
 -		__field(	__u64,	lright			)
  		__field(	__u64,	pleft			)
  		__field(	__u64,	pright			)
  	),
@@@ -709,13 -731,17 +707,13 @@@
  		__entry->pright	= ar->pright;
  	),
  
 -	TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu "
 -		  "goal %llu lleft %llu lright %llu pleft %llu pright %llu",
 +	TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
 +		  "goal %llu lleft %u lright %u pleft %llu pright %llu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
 -		  (unsigned long) __entry->ino,
 -		  __entry->flags, __entry->len, __entry->block,
 -		  (unsigned long long) __entry->logical,
 -		  (unsigned long long) __entry->goal,
 -		  (unsigned long long) __entry->lleft,
 -		  (unsigned long long) __entry->lright,
 -		  (unsigned long long) __entry->pleft,
 -		  (unsigned long long) __entry->pright)
 +		  (unsigned long) __entry->ino, __entry->flags,
 +		  __entry->len, __entry->block, __entry->logical,
 +		  __entry->goal,  __entry->lleft, __entry->lright,
 +		  __entry->pleft, __entry->pright)
  );
  
  TRACE_EVENT(ext4_free_blocks,
@@@ -727,10 -753,10 +725,10 @@@
  	TP_STRUCT__entry(
  		__field(	dev_t,	dev			)
  		__field(	ino_t,	ino			)
 -		__field(      umode_t, mode			)
 +		__field(	umode_t, mode			)
  		__field(	__u64,	block			)
  		__field(	unsigned long,	count		)
 -		__field(	 int,	flags			)
 +		__field(	int,	flags			)
  	),
  
  	TP_fast_assign(
@@@ -770,7 -796,7 +768,7 @@@ TRACE_EVENT(ext4_sync_file_enter
  		__entry->parent		= dentry->d_parent->d_inode->i_ino;
  	),
  
 -	TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
 +	TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
  		  (unsigned long) __entry->parent, __entry->datasync)
@@@ -793,7 -819,7 +791,7 @@@ TRACE_EVENT(ext4_sync_file_exit
  		__entry->dev		= inode->i_sb->s_dev;
  	),
  
 -	TP_printk("dev %d,%d ino %ld ret %d",
 +	TP_printk("dev %d,%d ino %lu ret %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
  		  __entry->ret)
@@@ -977,7 -1003,7 +975,7 @@@ DECLARE_EVENT_CLASS(ext4__mballoc
  		__entry->result_len	= len;
  	),
  
 -	TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
 +	TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
  		  __entry->result_group, __entry->result_start,
@@@ -1065,7 -1091,7 +1063,7 @@@ TRACE_EVENT(ext4_da_update_reserve_spac
  		  "allocated_meta_blocks %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  __entry->mode,  (unsigned long long) __entry->i_blocks,
 +		  __entry->mode, __entry->i_blocks,
  		  __entry->used_blocks, __entry->reserved_data_blocks,
  		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -1099,7 -1125,7 +1097,7 @@@ TRACE_EVENT(ext4_da_reserve_space
  		  "reserved_data_blocks %d reserved_meta_blocks %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  __entry->mode, (unsigned long long) __entry->i_blocks,
 +		  __entry->mode, __entry->i_blocks,
  		  __entry->md_needed, __entry->reserved_data_blocks,
  		  __entry->reserved_meta_blocks)
  );
@@@ -1136,7 -1162,7 +1134,7 @@@ TRACE_EVENT(ext4_da_release_space
  		  "allocated_meta_blocks %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  __entry->mode, (unsigned long long) __entry->i_blocks,
 +		  __entry->mode, __entry->i_blocks,
  		  __entry->freed_blocks, __entry->reserved_data_blocks,
  		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
  );
@@@ -1211,15 -1237,14 +1209,15 @@@ TRACE_EVENT(ext4_direct_IO_enter
  		__entry->rw	= rw;
  	),
  
 -	TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
 +	TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (unsigned long long) __entry->pos, __entry->len, __entry->rw)
 +		  __entry->pos, __entry->len, __entry->rw)
  );
  
  TRACE_EVENT(ext4_direct_IO_exit,
 -	TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret),
 +	TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
 +		 int rw, int ret),
  
  	TP_ARGS(inode, offset, len, rw, ret),
  
@@@ -1241,10 -1266,10 +1239,10 @@@
  		__entry->ret	= ret;
  	),
  
 -	TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
 +	TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (unsigned long long) __entry->pos, __entry->len,
 +		  __entry->pos, __entry->len,
  		  __entry->rw, __entry->ret)
  );
  
@@@ -1269,15 -1294,15 +1267,15 @@@ TRACE_EVENT(ext4_fallocate_enter
  		__entry->mode	= mode;
  	),
  
 -	TP_printk("dev %d,%d ino %ld pos %llu len %llu mode %d",
 +	TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
 -		  (unsigned long) __entry->ino,
 -		  (unsigned long long) __entry->pos,
 -		  (unsigned long long) __entry->len, __entry->mode)
 +		  (unsigned long) __entry->ino, __entry->pos,
 +		  __entry->len, __entry->mode)
  );
  
  TRACE_EVENT(ext4_fallocate_exit,
 -	TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret),
 +	TP_PROTO(struct inode *inode, loff_t offset,
 +		 unsigned int max_blocks, int ret),
  
  	TP_ARGS(inode, offset, max_blocks, ret),
  
@@@ -1285,7 -1310,7 +1283,7 @@@
  		__field(	ino_t,	ino			)
  		__field(	dev_t,	dev			)
  		__field(	loff_t,	pos			)
 -		__field(	unsigned,	blocks		)
 +		__field(	unsigned int,	blocks		)
  		__field(	int, 	ret			)
  	),
  
@@@ -1297,10 -1322,10 +1295,10 @@@
  		__entry->ret	= ret;
  	),
  
 -	TP_printk("dev %d,%d ino %ld pos %llu blocks %d ret %d",
 +	TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (unsigned long long) __entry->pos, __entry->blocks,
 +		  __entry->pos, __entry->blocks,
  		  __entry->ret)
  );
  
@@@ -1323,7 -1348,7 +1321,7 @@@ TRACE_EVENT(ext4_unlink_enter
  		__entry->dev		= dentry->d_inode->i_sb->s_dev;
  	),
  
 -	TP_printk("dev %d,%d ino %ld size %lld parent %ld",
 +	TP_printk("dev %d,%d ino %lu size %lld parent %lu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino, __entry->size,
  		  (unsigned long) __entry->parent)
@@@ -1346,7 -1371,7 +1344,7 @@@ TRACE_EVENT(ext4_unlink_exit
  		__entry->ret		= ret;
  	),
  
 -	TP_printk("dev %d,%d ino %ld ret %d",
 +	TP_printk("dev %d,%d ino %lu ret %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
  		  __entry->ret)
@@@ -1360,7 -1385,7 +1358,7 @@@ DECLARE_EVENT_CLASS(ext4__truncate
  	TP_STRUCT__entry(
  		__field(	ino_t,  	ino		)
  		__field(	dev_t,  	dev		)
 -		__field(	blkcnt_t,	blocks		)
 +		__field(	__u64,		blocks		)
  	),
  
  	TP_fast_assign(
@@@ -1369,9 -1394,9 +1367,9 @@@
  		__entry->blocks	= inode->i_blocks;
  	),
  
 -	TP_printk("dev %d,%d ino %lu blocks %lu",
 +	TP_printk("dev %d,%d ino %lu blocks %llu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
 -		  (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
 +		  (unsigned long) __entry->ino, __entry->blocks)
  );
  
  DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,
@@@ -1390,7 -1415,7 +1388,7 @@@ DEFINE_EVENT(ext4__truncate, ext4_trunc
  
  DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
  	TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 -		 unsigned len, unsigned flags),
 +		 unsigned int len, unsigned int flags),
  
  	TP_ARGS(inode, lblk, len, flags),
  
@@@ -1398,8 -1423,8 +1396,8 @@@
  		__field(	ino_t,  	ino		)
  		__field(	dev_t,  	dev		)
  		__field(	ext4_lblk_t,	lblk		)
 -		__field(	unsigned,	len		)
 -		__field(	unsigned,	flags		)
 +		__field(	unsigned int,	len		)
 +		__field(	unsigned int,	flags		)
  	),
  
  	TP_fast_assign(
@@@ -1413,7 -1438,7 +1411,7 @@@
  	TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (unsigned) __entry->lblk, __entry->len, __entry->flags)
 +		  __entry->lblk, __entry->len, __entry->flags)
  );
  
  DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
@@@ -1432,7 -1457,7 +1430,7 @@@ DEFINE_EVENT(ext4__map_blocks_enter, ex
  
  DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
  	TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
 -		 ext4_fsblk_t pblk, unsigned len, int ret),
 +		 ext4_fsblk_t pblk, unsigned int len, int ret),
  
  	TP_ARGS(inode, lblk, pblk, len, ret),
  
@@@ -1441,7 -1466,7 +1439,7 @@@
  		__field(	dev_t,		dev		)
  		__field(	ext4_lblk_t,	lblk		)
  		__field(	ext4_fsblk_t,	pblk		)
 -		__field(	unsigned,	len		)
 +		__field(	unsigned int,	len		)
  		__field(	int,		ret		)
  	),
  
@@@ -1457,7 -1482,7 +1455,7 @@@
  	TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
 +		  __entry->lblk, __entry->pblk,
  		  __entry->len, __entry->ret)
  );
  
@@@ -1497,7 -1522,7 +1495,7 @@@ TRACE_EVENT(ext4_ext_load_extent
  	TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
  		  MAJOR(__entry->dev), MINOR(__entry->dev),
  		  (unsigned long) __entry->ino,
 -		  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk)
 +		  __entry->lblk, __entry->pblk)
  );
  
  TRACE_EVENT(ext4_load_inode,
diff --combined mm/backing-dev.c
index 8290b1e88257,ddd0345e2e6d..d6edf8d14f9c
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@@ -45,6 -45,17 +45,17 @@@ static struct timer_list sync_supers_ti
  static int bdi_sync_supers(void *);
  static void sync_supers_timer_fn(unsigned long);
  
+ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+ {
+ 	if (wb1 < wb2) {
+ 		spin_lock(&wb1->list_lock);
+ 		spin_lock_nested(&wb2->list_lock, 1);
+ 	} else {
+ 		spin_lock(&wb2->list_lock);
+ 		spin_lock_nested(&wb1->list_lock, 1);
+ 	}
+ }
+ 
  #ifdef CONFIG_DEBUG_FS
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
@@@ -67,34 -78,42 +78,42 @@@ static int bdi_debug_stats_show(struct 
  	struct inode *inode;
  
  	nr_dirty = nr_io = nr_more_io = 0;
- 	spin_lock(&inode_wb_list_lock);
+ 	spin_lock(&wb->list_lock);
  	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
  		nr_dirty++;
  	list_for_each_entry(inode, &wb->b_io, i_wb_list)
  		nr_io++;
  	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
  		nr_more_io++;
- 	spin_unlock(&inode_wb_list_lock);
+ 	spin_unlock(&wb->list_lock);
  
  	global_dirty_limits(&background_thresh, &dirty_thresh);
  	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
  
  #define K(x) ((x) << (PAGE_SHIFT - 10))
  	seq_printf(m,
- 		   "BdiWriteback:     %8lu kB\n"
- 		   "BdiReclaimable:   %8lu kB\n"
- 		   "BdiDirtyThresh:   %8lu kB\n"
- 		   "DirtyThresh:      %8lu kB\n"
- 		   "BackgroundThresh: %8lu kB\n"
- 		   "b_dirty:          %8lu\n"
- 		   "b_io:             %8lu\n"
- 		   "b_more_io:        %8lu\n"
- 		   "bdi_list:         %8u\n"
- 		   "state:            %8lx\n",
+ 		   "BdiWriteback:       %10lu kB\n"
+ 		   "BdiReclaimable:     %10lu kB\n"
+ 		   "BdiDirtyThresh:     %10lu kB\n"
+ 		   "DirtyThresh:        %10lu kB\n"
+ 		   "BackgroundThresh:   %10lu kB\n"
+ 		   "BdiWritten:         %10lu kB\n"
+ 		   "BdiWriteBandwidth:  %10lu kBps\n"
+ 		   "b_dirty:            %10lu\n"
+ 		   "b_io:               %10lu\n"
+ 		   "b_more_io:          %10lu\n"
+ 		   "bdi_list:           %10u\n"
+ 		   "state:              %10lx\n",
  		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
  		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- 		   K(bdi_thresh), K(dirty_thresh),
- 		   K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ 		   K(bdi_thresh),
+ 		   K(dirty_thresh),
+ 		   K(background_thresh),
+ 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ 		   (unsigned long) K(bdi->write_bandwidth),
+ 		   nr_dirty,
+ 		   nr_io,
+ 		   nr_more_io,
  		   !list_empty(&bdi->bdi_list), bdi->state);
  #undef K
  
@@@ -249,18 -268,6 +268,6 @@@ int bdi_has_dirty_io(struct backing_dev
  	return wb_has_dirty_io(&bdi->wb);
  }
  
- static void bdi_flush_io(struct backing_dev_info *bdi)
- {
- 	struct writeback_control wbc = {
- 		.sync_mode		= WB_SYNC_NONE,
- 		.older_than_this	= NULL,
- 		.range_cyclic		= 1,
- 		.nr_to_write		= 1024,
- 	};
- 
- 	writeback_inodes_wb(&bdi->wb, &wbc);
- }
- 
  /*
   * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
   * or we risk deadlocking on ->s_umount. The longer term solution would be
@@@ -446,9 -453,10 +453,10 @@@ static int bdi_forker_thread(void *ptr
  			if (IS_ERR(task)) {
  				/*
  				 * If thread creation fails, force writeout of
- 				 * the bdi from the thread.
+ 				 * the bdi from the thread. Hopefully 1024 is
+ 				 * large enough for efficient IO.
  				 */
- 				bdi_flush_io(bdi);
+ 				writeback_inodes_wb(&bdi->wb, 1024);
  			} else {
  				/*
  				 * The spinlock makes sure we do not lose
@@@ -505,7 -513,7 +513,7 @@@ static void bdi_remove_from_list(struc
  	list_del_rcu(&bdi->bdi_list);
  	spin_unlock_bh(&bdi_lock);
  
 -	synchronize_rcu();
 +	synchronize_rcu_expedited();
  }
  
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@@ -606,7 -614,6 +614,7 @@@ static void bdi_prune_sb(struct backing
  void bdi_unregister(struct backing_dev_info *bdi)
  {
  	if (bdi->dev) {
 +		bdi_set_min_ratio(bdi, 0);
  		trace_writeback_bdi_unregister(bdi);
  		bdi_prune_sb(bdi);
  		del_timer_sync(&bdi->wb.wakeup_timer);
@@@ -629,9 -636,15 +637,15 @@@ static void bdi_wb_init(struct bdi_writ
  	INIT_LIST_HEAD(&wb->b_dirty);
  	INIT_LIST_HEAD(&wb->b_io);
  	INIT_LIST_HEAD(&wb->b_more_io);
+ 	spin_lock_init(&wb->list_lock);
  	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
  }
  
+ /*
+  * Initial write bandwidth: 100 MB/s
+  */
+ #define INIT_BW		(100 << (20 - PAGE_SHIFT))
+ 
  int bdi_init(struct backing_dev_info *bdi)
  {
  	int i, err;
@@@ -654,6 -667,13 +668,13 @@@
  	}
  
  	bdi->dirty_exceeded = 0;
+ 
+ 	bdi->bw_time_stamp = jiffies;
+ 	bdi->written_stamp = 0;
+ 
+ 	bdi->write_bandwidth = INIT_BW;
+ 	bdi->avg_write_bandwidth = INIT_BW;
+ 
  	err = prop_local_init_percpu(&bdi->completions);
  
  	if (err) {
@@@ -677,11 -697,12 +698,12 @@@ void bdi_destroy(struct backing_dev_inf
  	if (bdi_has_dirty_io(bdi)) {
  		struct bdi_writeback *dst = &default_backing_dev_info.wb;
  
- 		spin_lock(&inode_wb_list_lock);
+ 		bdi_lock_two(&bdi->wb, dst);
  		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
  		list_splice(&bdi->wb.b_io, &dst->b_io);
  		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- 		spin_unlock(&inode_wb_list_lock);
+ 		spin_unlock(&bdi->wb.list_lock);
+ 		spin_unlock(&dst->list_lock);
  	}
  
  	bdi_unregister(bdi);
diff --combined mm/filemap.c
index 10a171113273,1e492c3dd6f8..867d40222ec7
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@@ -78,7 -78,10 +78,7 @@@
   *  ->i_mutex			(generic_file_buffered_write)
   *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
   *
-  *  inode_wb_list_lock
 - *  ->i_mutex
 - *    ->i_alloc_sem             (various)
 - *
+  *  bdi->wb.list_lock
   *    sb_lock			(fs/fs-writeback.c)
   *    ->mapping->tree_lock	(__sync_single_inode)
   *
@@@ -96,9 -99,9 +96,9 @@@
   *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
   *    ->private_lock		(page_remove_rmap->set_page_dirty)
   *    ->tree_lock		(page_remove_rmap->set_page_dirty)
-  *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
+  *    bdi->wb.list_lock		(page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
-  *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
+  *    bdi->wb.list_lock		(zap_pte_range->set_page_dirty)
   *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
   *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
   *
@@@ -128,7 -131,6 +128,7 @@@ void __delete_from_page_cache(struct pa
  
  	radix_tree_delete(&mapping->page_tree, page->index);
  	page->mapping = NULL;
 +	/* Leave page->index set: truncation lookup relies upon it */
  	mapping->nrpages--;
  	__dec_zone_page_state(page, NR_FILE_PAGES);
  	if (PageSwapBacked(page))
@@@ -484,7 -486,6 +484,7 @@@ int add_to_page_cache_locked(struct pag
  			spin_unlock_irq(&mapping->tree_lock);
  		} else {
  			page->mapping = NULL;
 +			/* Leave page->index set: truncation relies upon it */
  			spin_unlock_irq(&mapping->tree_lock);
  			mem_cgroup_uncharge_cache_page(page);
  			page_cache_release(page);
@@@ -1794,7 -1795,7 +1794,7 @@@ EXPORT_SYMBOL(generic_file_readonly_mma
  
  static struct page *__read_cache_page(struct address_space *mapping,
  				pgoff_t index,
 -				int (*filler)(void *,struct page*),
 +				int (*filler)(void *, struct page *),
  				void *data,
  				gfp_t gfp)
  {
@@@ -1825,7 -1826,7 +1825,7 @@@ repeat
  
  static struct page *do_read_cache_page(struct address_space *mapping,
  				pgoff_t index,
 -				int (*filler)(void *,struct page*),
 +				int (*filler)(void *, struct page *),
  				void *data,
  				gfp_t gfp)
  
@@@ -1865,7 -1866,7 +1865,7 @@@ out
   * @mapping:	the page's address_space
   * @index:	the page index
   * @filler:	function to perform the read
 - * @data:	destination for read data
 + * @data:	first arg to filler(data, page) function, often left as NULL
   *
   * Same as read_cache_page, but don't wait for page to become unlocked
   * after submitting it to the filler.
@@@ -1877,7 -1878,7 +1877,7 @@@
   */
  struct page *read_cache_page_async(struct address_space *mapping,
  				pgoff_t index,
 -				int (*filler)(void *,struct page*),
 +				int (*filler)(void *, struct page *),
  				void *data)
  {
  	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@@ -1925,7 -1926,7 +1925,7 @@@ EXPORT_SYMBOL(read_cache_page_gfp)
   * @mapping:	the page's address_space
   * @index:	the page index
   * @filler:	function to perform the read
 - * @data:	destination for read data
 + * @data:	first arg to filler(data, page) function, often left as NULL
   *
   * Read into the page cache. If a page already exists, and PageUptodate() is
   * not set, try to fill the page then wait for it to become unlocked.
@@@ -1934,7 -1935,7 +1934,7 @@@
   */
  struct page *read_cache_page(struct address_space *mapping,
  				pgoff_t index,
 -				int (*filler)(void *,struct page*),
 +				int (*filler)(void *, struct page *),
  				void *data)
  {
  	return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@@ -1999,7 -2000,7 +1999,7 @@@ int file_remove_suid(struct file *file
  		error = security_inode_killpriv(dentry);
  	if (!error && killsuid)
  		error = __remove_suid(dentry, killsuid);
 -	if (!error)
 +	if (!error && (inode->i_sb->s_flags & MS_NOSEC))
  		inode->i_flags |= S_NOSEC;
  
  	return error;
diff --combined mm/page-writeback.c
index d8767b381b9c,1d781803e629..d1960744f881
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@@ -36,6 -36,16 +36,16 @@@
  #include <linux/pagevec.h>
  #include <trace/events/writeback.h>
  
+ /*
+  * Sleep at most 200ms at a time in balance_dirty_pages().
+  */
+ #define MAX_PAUSE		max(HZ/5, 1)
+ 
+ /*
+  * Estimate write bandwidth at 200ms intervals.
+  */
+ #define BANDWIDTH_INTERVAL	max(HZ/5, 1)
+ 
  /*
   * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
   * will look to see if it needs to force writeback or throttling.
@@@ -111,6 -121,7 +121,7 @@@ EXPORT_SYMBOL(laptop_mode)
  
  /* End of sysctl-exported parameters */
  
+ unsigned long global_dirty_limit;
  
  /*
   * Scale the writeback cache size proportional to the relative writeout speeds.
@@@ -219,6 -230,7 +230,7 @@@ int dirty_bytes_handler(struct ctl_tabl
   */
  static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
  {
+ 	__inc_bdi_stat(bdi, BDI_WRITTEN);
  	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
  			      bdi->max_prop_frac);
  }
@@@ -244,13 -256,8 +256,8 @@@ void task_dirty_inc(struct task_struct 
  static void bdi_writeout_fraction(struct backing_dev_info *bdi,
  		long *numerator, long *denominator)
  {
- 	if (bdi_cap_writeback_dirty(bdi)) {
- 		prop_fraction_percpu(&vm_completions, &bdi->completions,
+ 	prop_fraction_percpu(&vm_completions, &bdi->completions,
  				numerator, denominator);
- 	} else {
- 		*numerator = 0;
- 		*denominator = 1;
- 	}
  }
  
  static inline void task_dirties_fraction(struct task_struct *tsk,
@@@ -274,12 -281,13 +281,13 @@@
   * effectively curb the growth of dirty pages. Light dirtiers with high enough
   * dirty threshold may never get throttled.
   */
+ #define TASK_LIMIT_FRACTION 8
  static unsigned long task_dirty_limit(struct task_struct *tsk,
  				       unsigned long bdi_dirty)
  {
  	long numerator, denominator;
  	unsigned long dirty = bdi_dirty;
- 	u64 inv = dirty >> 3;
+ 	u64 inv = dirty / TASK_LIMIT_FRACTION;
  
  	task_dirties_fraction(tsk, &numerator, &denominator);
  	inv *= numerator;
@@@ -290,6 -298,12 +298,12 @@@
  	return max(dirty, bdi_dirty/2);
  }
  
+ /* Minimum limit for any task */
+ static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+ {
+ 	return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+ }
+ 
  /*
   *
   */
@@@ -397,6 -411,11 +411,11 @@@ unsigned long determine_dirtyable_memor
  	return x + 1;	/* Ensure that we never return 0 */
  }
  
+ static unsigned long hard_dirty_limit(unsigned long thresh)
+ {
+ 	return max(thresh, global_dirty_limit);
+ }
+ 
  /*
   * global_dirty_limits - background-writeback and dirty-throttling thresholds
   *
@@@ -435,12 -454,20 +454,20 @@@ void global_dirty_limits(unsigned long 
  	}
  	*pbackground = background;
  	*pdirty = dirty;
+ 	trace_global_dirty_state(background, dirty);
  }
  
- /*
+ /**
   * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+  * @bdi: the backing_dev_info to query
+  * @dirty: global dirty limit in pages
   *
-  * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+  * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+  * Note that the "limit" in the name is not treated as a hard limit by
+  * balance_dirty_pages().
+  *
+  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
   * - starving fast devices
   * - piling up dirty pages (that will take long time to sync) on slow devices
   *
@@@ -468,6 -495,153 +495,153 @@@ unsigned long bdi_dirty_limit(struct ba
  	return bdi_dirty;
  }
  
+ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+ 				       unsigned long elapsed,
+ 				       unsigned long written)
+ {
+ 	const unsigned long period = roundup_pow_of_two(3 * HZ);
+ 	unsigned long avg = bdi->avg_write_bandwidth;
+ 	unsigned long old = bdi->write_bandwidth;
+ 	u64 bw;
+ 
+ 	/*
+ 	 * bw = written * HZ / elapsed
+ 	 *
+ 	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
+ 	 * write_bandwidth = ---------------------------------------------------
+ 	 *                                          period
+ 	 */
+ 	bw = written - bdi->written_stamp;
+ 	bw *= HZ;
+ 	if (unlikely(elapsed > period)) {
+ 		do_div(bw, elapsed);
+ 		avg = bw;
+ 		goto out;
+ 	}
+ 	bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ 	bw >>= ilog2(period);
+ 
+ 	/*
+ 	 * one more level of smoothing, for filtering out sudden spikes
+ 	 */
+ 	if (avg > old && old >= (unsigned long)bw)
+ 		avg -= (avg - old) >> 3;
+ 
+ 	if (avg < old && old <= (unsigned long)bw)
+ 		avg += (old - avg) >> 3;
+ 
+ out:
+ 	bdi->write_bandwidth = bw;
+ 	bdi->avg_write_bandwidth = avg;
+ }
+ 
+ /*
+  * The global dirtyable memory and dirty threshold could be suddenly knocked
+  * down by a large amount (eg. on the startup of KVM in a swapless system).
+  * This may throw the system into deep dirty exceeded state and throttle
+  * heavy/light dirtiers alike. To retain good responsiveness, maintain
+  * global_dirty_limit, which follows the knocked-down dirty threshold
+  * downward only slowly.
+  */
+ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+ {
+ 	unsigned long limit = global_dirty_limit;
+ 
+ 	/*
+ 	 * Follow up in one step.
+ 	 */
+ 	if (limit < thresh) {
+ 		limit = thresh;
+ 		goto update;
+ 	}
+ 
+ 	/*
+ 	 * Follow down slowly. Use the higher one as the target, because thresh
+ 	 * may drop below dirty. This is exactly the reason to introduce
+ 	 * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ 	 */
+ 	thresh = max(thresh, dirty);
+ 	if (limit > thresh) {
+ 		limit -= (limit - thresh) >> 5;
+ 		goto update;
+ 	}
+ 	return;
+ update:
+ 	global_dirty_limit = limit;
+ }
+ 
+ static void global_update_bandwidth(unsigned long thresh,
+ 				    unsigned long dirty,
+ 				    unsigned long now)
+ {
+ 	static DEFINE_SPINLOCK(dirty_lock);
+ 	static unsigned long update_time;
+ 
+ 	/*
+ 	 * check locklessly first to avoid taking the lock most of the time
+ 	 */
+ 	if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ 		return;
+ 
+ 	spin_lock(&dirty_lock);
+ 	if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+ 		update_dirty_limit(thresh, dirty);
+ 		update_time = now;
+ 	}
+ 	spin_unlock(&dirty_lock);
+ }
+ 
+ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+ 			    unsigned long thresh,
+ 			    unsigned long dirty,
+ 			    unsigned long bdi_thresh,
+ 			    unsigned long bdi_dirty,
+ 			    unsigned long start_time)
+ {
+ 	unsigned long now = jiffies;
+ 	unsigned long elapsed = now - bdi->bw_time_stamp;
+ 	unsigned long written;
+ 
+ 	/*
+ 	 * rate-limit, only update once every 200ms.
+ 	 */
+ 	if (elapsed < BANDWIDTH_INTERVAL)
+ 		return;
+ 
+ 	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+ 
+ 	/*
+ 	 * Skip quiet periods when disk bandwidth is under-utilized.
+ 	 * (at least 1s idle time between two flusher runs)
+ 	 */
+ 	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ 		goto snapshot;
+ 
+ 	if (thresh)
+ 		global_update_bandwidth(thresh, dirty, now);
+ 
+ 	bdi_update_write_bandwidth(bdi, elapsed, written);
+ 
+ snapshot:
+ 	bdi->written_stamp = written;
+ 	bdi->bw_time_stamp = now;
+ }
+ 
+ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+ 				 unsigned long thresh,
+ 				 unsigned long dirty,
+ 				 unsigned long bdi_thresh,
+ 				 unsigned long bdi_dirty,
+ 				 unsigned long start_time)
+ {
+ 	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+ 		return;
+ 	spin_lock(&bdi->wb.list_lock);
+ 	__bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+ 			       start_time);
+ 	spin_unlock(&bdi->wb.list_lock);
+ }
+ 
  /*
   * balance_dirty_pages() must be called by processes which are generating dirty
   * data.  It looks at the number of dirty pages in the machine and will force
@@@ -478,27 -652,25 +652,25 @@@
  static void balance_dirty_pages(struct address_space *mapping,
  				unsigned long write_chunk)
  {
- 	long nr_reclaimable, bdi_nr_reclaimable;
- 	long nr_writeback, bdi_nr_writeback;
+ 	unsigned long nr_reclaimable, bdi_nr_reclaimable;
+ 	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
+ 	unsigned long bdi_dirty;
  	unsigned long background_thresh;
  	unsigned long dirty_thresh;
  	unsigned long bdi_thresh;
+ 	unsigned long task_bdi_thresh;
+ 	unsigned long min_task_bdi_thresh;
  	unsigned long pages_written = 0;
  	unsigned long pause = 1;
  	bool dirty_exceeded = false;
+ 	bool clear_dirty_exceeded = true;
  	struct backing_dev_info *bdi = mapping->backing_dev_info;
+ 	unsigned long start_time = jiffies;
  
  	for (;;) {
- 		struct writeback_control wbc = {
- 			.sync_mode	= WB_SYNC_NONE,
- 			.older_than_this = NULL,
- 			.nr_to_write	= write_chunk,
- 			.range_cyclic	= 1,
- 		};
- 
  		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
  					global_page_state(NR_UNSTABLE_NFS);
- 		nr_writeback = global_page_state(NR_WRITEBACK);
+ 		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
  
  		global_dirty_limits(&background_thresh, &dirty_thresh);
  
@@@ -507,12 -679,12 +679,12 @@@
  		 * catch-up. This avoids (excessively) small writeouts
  		 * when the bdi limits are ramping up.
  		 */
- 		if (nr_reclaimable + nr_writeback <=
- 				(background_thresh + dirty_thresh) / 2)
+ 		if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
  			break;
  
  		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
- 		bdi_thresh = task_dirty_limit(current, bdi_thresh);
+ 		min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+ 		task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
  
  		/*
  		 * In order to avoid the stacked BDI deadlock we need
@@@ -524,12 -696,14 +696,14 @@@
  		 * actually dirty; with m+n sitting in the percpu
  		 * deltas.
  		 */
- 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ 		if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
  			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- 			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ 			bdi_dirty = bdi_nr_reclaimable +
+ 				    bdi_stat_sum(bdi, BDI_WRITEBACK);
  		} else {
  			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ 			bdi_dirty = bdi_nr_reclaimable +
+ 				    bdi_stat(bdi, BDI_WRITEBACK);
  		}
  
  		/*
@@@ -538,9 -712,10 +712,10 @@@
  		 * bdi or process from holding back light ones; The latter is
  		 * the last resort safeguard.
  		 */
- 		dirty_exceeded =
- 			(bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
- 			|| (nr_reclaimable + nr_writeback > dirty_thresh);
+ 		dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+ 				  (nr_dirty > dirty_thresh);
+ 		clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+ 					(nr_dirty <= dirty_thresh);
  
  		if (!dirty_exceeded)
  			break;
@@@ -548,6 -723,9 +723,9 @@@
  		if (!bdi->dirty_exceeded)
  			bdi->dirty_exceeded = 1;
  
+ 		bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+ 				     bdi_thresh, bdi_dirty, start_time);
+ 
  		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
  		 * Unstable writes are a feature of certain networked
  		 * filesystems (i.e. NFS) in which data may have been
@@@ -557,17 -735,40 +735,40 @@@
  		 * threshold otherwise wait until the disk writes catch
  		 * up.
  		 */
- 		trace_wbc_balance_dirty_start(&wbc, bdi);
- 		if (bdi_nr_reclaimable > bdi_thresh) {
- 			writeback_inodes_wb(&bdi->wb, &wbc);
- 			pages_written += write_chunk - wbc.nr_to_write;
- 			trace_wbc_balance_dirty_written(&wbc, bdi);
+ 		trace_balance_dirty_start(bdi);
+ 		if (bdi_nr_reclaimable > task_bdi_thresh) {
+ 			pages_written += writeback_inodes_wb(&bdi->wb,
+ 							     write_chunk);
+ 			trace_balance_dirty_written(bdi, pages_written);
  			if (pages_written >= write_chunk)
  				break;		/* We've done our duty */
  		}
- 		trace_wbc_balance_dirty_wait(&wbc, bdi);
  		__set_current_state(TASK_UNINTERRUPTIBLE);
  		io_schedule_timeout(pause);
+ 		trace_balance_dirty_wait(bdi);
+ 
+ 		dirty_thresh = hard_dirty_limit(dirty_thresh);
+ 		/*
+ 		 * max-pause area. If dirty exceeded but still within this
+ 		 * area, no need to sleep for more than 200ms: (a) 8 pages per
+ 		 * 200ms is typically more than enough to curb heavy dirtiers;
+ 		 * (b) the pause time limit makes the dirtiers more responsive.
+ 		 */
+ 		if (nr_dirty < dirty_thresh +
+ 			       dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+ 		    time_after(jiffies, start_time + MAX_PAUSE))
+ 			break;
+ 		/*
+ 		 * pass-good area. When some bdi gets blocked (eg. NFS server
+ 		 * not responding), or write bandwidth dropped dramatically due
+ 		 * to concurrent reads, or dirty threshold suddenly dropped and
+ 		 * the dirty pages cannot be brought down anytime soon (eg. on
+ 		 * slow USB stick), at least let go of the good bdi's.
+ 		 */
+ 		if (nr_dirty < dirty_thresh +
+ 			       dirty_thresh / DIRTY_PASSGOOD_AREA &&
+ 		    bdi_dirty < bdi_thresh)
+ 			break;
  
  		/*
  		 * Increase the delay for each loop, up to our previous
@@@ -578,7 -779,8 +779,8 @@@
  			pause = HZ / 10;
  	}
  
- 	if (!dirty_exceeded && bdi->dirty_exceeded)
+ 	/* Clear dirty_exceeded flag only when no task can exceed the limit */
+ 	if (clear_dirty_exceeded && bdi->dirty_exceeded)
  		bdi->dirty_exceeded = 0;
  
  	if (writeback_in_progress(bdi))
@@@ -626,9 -828,13 +828,13 @@@ static DEFINE_PER_CPU(unsigned long, bd
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
  					unsigned long nr_pages_dirtied)
  {
+ 	struct backing_dev_info *bdi = mapping->backing_dev_info;
  	unsigned long ratelimit;
  	unsigned long *p;
  
+ 	if (!bdi_cap_account_dirty(bdi))
+ 		return;
+ 
  	ratelimit = ratelimit_pages;
  	if (mapping->backing_dev_info->dirty_exceeded)
  		ratelimit = 8;
@@@ -892,12 -1098,12 +1098,12 @@@ int write_cache_pages(struct address_sp
  			range_whole = 1;
  		cycled = 1; /* ignore range_cyclic tests */
  	}
- 	if (wbc->sync_mode == WB_SYNC_ALL)
+ 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  		tag = PAGECACHE_TAG_TOWRITE;
  	else
  		tag = PAGECACHE_TAG_DIRTY;
  retry:
- 	if (wbc->sync_mode == WB_SYNC_ALL)
+ 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  		tag_pages_for_writeback(mapping, index, end);
  	done_index = index;
  	while (!done && (index <= end)) {
@@@ -1141,6 -1347,7 +1347,6 @@@ EXPORT_SYMBOL(account_page_dirtied)
  void account_page_writeback(struct page *page)
  {
  	inc_zone_page_state(page, NR_WRITEBACK);
 -	inc_zone_page_state(page, NR_WRITTEN);
  }
  EXPORT_SYMBOL(account_page_writeback);
  
@@@ -1357,10 -1564,8 +1563,10 @@@ int test_clear_page_writeback(struct pa
  	} else {
  		ret = TestClearPageWriteback(page);
  	}
 -	if (ret)
 +	if (ret) {
  		dec_zone_page_state(page, NR_WRITEBACK);
 +		inc_zone_page_state(page, NR_WRITTEN);
 +	}
  	return ret;
  }
  
@@@ -1406,6 -1611,10 +1612,6 @@@ EXPORT_SYMBOL(test_set_page_writeback)
   */
  int mapping_tagged(struct address_space *mapping, int tag)
  {
 -	int ret;
 -	rcu_read_lock();
 -	ret = radix_tree_tagged(&mapping->page_tree, tag);
 -	rcu_read_unlock();
 -	return ret;
 +	return radix_tree_tagged(&mapping->page_tree, tag);
  }
  EXPORT_SYMBOL(mapping_tagged);
diff --combined mm/rmap.c
index 9701574bb67a,d04e36a7cc9f..8005080fb9e3
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@@ -21,6 -21,7 +21,6 @@@
   * Lock ordering in mm:
   *
   * inode->i_mutex	(while writing or truncating, not reading or faulting)
 - *   inode->i_alloc_sem (vmtruncate_range)
   *   mm->mmap_sem
   *     page->flags PG_locked (lock_page)
   *       mapping->i_mmap_mutex
@@@ -31,14 -32,15 +31,14 @@@
   *               mmlist_lock (in mmput, drain_mmlist and others)
   *               mapping->private_lock (in __set_page_dirty_buffers)
   *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
-  *               inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+  *               bdi->wb.list_lock (in set_page_dirty's __mark_inode_dirty)
   *                 sb_lock (within inode_lock in fs/fs-writeback.c)
   *                 mapping->tree_lock (widely used, in set_page_dirty,
   *                           in arch-dependent flush_dcache_mmap_lock,
+  *                           within bdi->wb.list_lock in __sync_single_inode)
+  *                           within bdi.wb->list_lock in __sync_single_inode)
   *
 - * (code doesn't rely on that order so it could be switched around)
 - * ->tasklist_lock
 - *   anon_vma->mutex      (memory_failure, collect_procs_anon)
 + * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
 + *   ->tasklist_lock
   *     pte map lock
   */
  
@@@ -110,9 -112,9 +110,9 @@@ static inline void anon_vma_free(struc
  	kmem_cache_free(anon_vma_cachep, anon_vma);
  }
  
 -static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
 +static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
  {
 -	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
 +	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
  }
  
  static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@@ -157,7 -159,7 +157,7 @@@ int anon_vma_prepare(struct vm_area_str
  		struct mm_struct *mm = vma->vm_mm;
  		struct anon_vma *allocated;
  
 -		avc = anon_vma_chain_alloc();
 +		avc = anon_vma_chain_alloc(GFP_KERNEL);
  		if (!avc)
  			goto out_enomem;
  
@@@ -198,32 -200,6 +198,32 @@@
  	return -ENOMEM;
  }
  
 +/*
 + * This is a useful helper function for locking the anon_vma root as
 + * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 + * have the same vma.
 + *
 + * Such anon_vma's should have the same root, so you'd expect to see
 + * just a single mutex_lock for the whole traversal.
 + */
 +static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
 +{
 +	struct anon_vma *new_root = anon_vma->root;
 +	if (new_root != root) {
 +		if (WARN_ON_ONCE(root))
 +			mutex_unlock(&root->mutex);
 +		root = new_root;
 +		mutex_lock(&root->mutex);
 +	}
 +	return root;
 +}
 +
 +static inline void unlock_anon_vma_root(struct anon_vma *root)
 +{
 +	if (root)
 +		mutex_unlock(&root->mutex);
 +}
 +
  static void anon_vma_chain_link(struct vm_area_struct *vma,
  				struct anon_vma_chain *avc,
  				struct anon_vma *anon_vma)
@@@ -232,11 -208,13 +232,11 @@@
  	avc->anon_vma = anon_vma;
  	list_add(&avc->same_vma, &vma->anon_vma_chain);
  
 -	anon_vma_lock(anon_vma);
  	/*
  	 * It's critical to add new vmas to the tail of the anon_vma,
  	 * see comment in huge_memory.c:__split_huge_page().
  	 */
  	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
 -	anon_vma_unlock(anon_vma);
  }
  
  /*
@@@ -246,24 -224,13 +246,24 @@@
  int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
  {
  	struct anon_vma_chain *avc, *pavc;
 +	struct anon_vma *root = NULL;
  
  	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
 -		avc = anon_vma_chain_alloc();
 -		if (!avc)
 -			goto enomem_failure;
 -		anon_vma_chain_link(dst, avc, pavc->anon_vma);
 +		struct anon_vma *anon_vma;
 +
 +		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
 +		if (unlikely(!avc)) {
 +			unlock_anon_vma_root(root);
 +			root = NULL;
 +			avc = anon_vma_chain_alloc(GFP_KERNEL);
 +			if (!avc)
 +				goto enomem_failure;
 +		}
 +		anon_vma = pavc->anon_vma;
 +		root = lock_anon_vma_root(root, anon_vma);
 +		anon_vma_chain_link(dst, avc, anon_vma);
  	}
 +	unlock_anon_vma_root(root);
  	return 0;
  
   enomem_failure:
@@@ -296,7 -263,7 +296,7 @@@ int anon_vma_fork(struct vm_area_struc
  	anon_vma = anon_vma_alloc();
  	if (!anon_vma)
  		goto out_error;
 -	avc = anon_vma_chain_alloc();
 +	avc = anon_vma_chain_alloc(GFP_KERNEL);
  	if (!avc)
  		goto out_error_free_anon_vma;
  
@@@ -313,9 -280,7 +313,9 @@@
  	get_anon_vma(anon_vma->root);
  	/* Mark this anon_vma as the one where our new (COWed) pages go. */
  	vma->anon_vma = anon_vma;
 +	anon_vma_lock(anon_vma);
  	anon_vma_chain_link(vma, avc, anon_vma);
 +	anon_vma_unlock(anon_vma);
  
  	return 0;
  
@@@ -326,43 -291,36 +326,43 @@@
  	return -ENOMEM;
  }
  
 -static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 -{
 -	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
 -	int empty;
 -
 -	/* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
 -	if (!anon_vma)
 -		return;
 -
 -	anon_vma_lock(anon_vma);
 -	list_del(&anon_vma_chain->same_anon_vma);
 -
 -	/* We must garbage collect the anon_vma if it's empty */
 -	empty = list_empty(&anon_vma->head);
 -	anon_vma_unlock(anon_vma);
 -
 -	if (empty)
 -		put_anon_vma(anon_vma);
 -}
 -
  void unlink_anon_vmas(struct vm_area_struct *vma)
  {
  	struct anon_vma_chain *avc, *next;
 +	struct anon_vma *root = NULL;
  
  	/*
  	 * Unlink each anon_vma chained to the VMA.  This list is ordered
  	 * from newest to oldest, ensuring the root anon_vma gets freed last.
  	 */
  	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 -		anon_vma_unlink(avc);
 +		struct anon_vma *anon_vma = avc->anon_vma;
 +
 +		root = lock_anon_vma_root(root, anon_vma);
 +		list_del(&avc->same_anon_vma);
 +
 +		/*
 +		 * Leave empty anon_vmas on the list - we'll need
 +		 * to free them outside the lock.
 +		 */
 +		if (list_empty(&anon_vma->head))
 +			continue;
 +
 +		list_del(&avc->same_vma);
 +		anon_vma_chain_free(avc);
 +	}
 +	unlock_anon_vma_root(root);
 +
 +	/*
 +	 * Iterate the list once more, it now only contains empty and unlinked
 +	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
 +	 * needing to acquire the anon_vma->root->mutex.
 +	 */
 +	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 +		struct anon_vma *anon_vma = avc->anon_vma;
 +
 +		put_anon_vma(anon_vma);
 +
  		list_del(&avc->same_vma);
  		anon_vma_chain_free(avc);
  	}
@@@ -869,11 -827,11 +869,11 @@@ int page_referenced(struct page *page
  								vm_flags);
  		if (we_locked)
  			unlock_page(page);
 +
 +		if (page_test_and_clear_young(page_to_pfn(page)))
 +			referenced++;
  	}
  out:
 -	if (page_test_and_clear_young(page_to_pfn(page)))
 -		referenced++;
 -
  	return referenced;
  }