Merge branch 'iocb' into for-next
author     Al Viro <viro@zeniv.linux.org.uk>
           Sun, 12 Apr 2015 02:24:41 +0000 (22:24 -0400)
committer  Al Viro <viro@zeniv.linux.org.uk>
           Sun, 12 Apr 2015 02:24:41 +0000 (22:24 -0400)
20 files changed:
arch/s390/hypfs/inode.c
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/scsi/sg.c
fs/aio.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/ceph/file.c
fs/ecryptfs/file.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/fuse/dev.c
fs/nfs/direct.c
fs/nfs/file.c
fs/ntfs/file.c
fs/xfs/xfs_file.c
include/linux/fs.h
kernel/printk/printk.c
mm/shmem.c
net/socket.c
sound/core/pcm_native.c

diff --combined arch/s390/hypfs/inode.c
@@@ -21,7 -21,7 +21,7 @@@
  #include <linux/module.h>
  #include <linux/seq_file.h>
  #include <linux/mount.h>
- #include <linux/aio.h>
+ #include <linux/uio.h>
  #include <asm/ebcdic.h>
  #include "hypfs.h"
  
@@@ -74,7 -74,7 +74,7 @@@ static void hypfs_remove(struct dentry 
        parent = dentry->d_parent;
        mutex_lock(&parent->d_inode->i_mutex);
        if (hypfs_positive(dentry)) {
 -              if (S_ISDIR(dentry->d_inode->i_mode))
 +              if (d_is_dir(dentry))
                        simple_rmdir(parent->d_inode, dentry);
                else
                        simple_unlink(parent->d_inode, dentry);
@@@ -144,32 -144,36 +144,32 @@@ static int hypfs_open(struct inode *ino
        return nonseekable_open(inode, filp);
  }
  
 -static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
 -                            unsigned long nr_segs, loff_t offset)
 +static ssize_t hypfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
 -      char *data;
 -      ssize_t ret;
 -      struct file *filp = iocb->ki_filp;
 -      /* XXX: temporary */
 -      char __user *buf = iov[0].iov_base;
 -      size_t count = iov[0].iov_len;
 -
 -      if (nr_segs != 1)
 -              return -EINVAL;
 -
 -      data = filp->private_data;
 -      ret = simple_read_from_buffer(buf, count, &offset, data, strlen(data));
 -      if (ret <= 0)
 -              return ret;
 +      struct file *file = iocb->ki_filp;
 +      char *data = file->private_data;
 +      size_t available = strlen(data);
 +      loff_t pos = iocb->ki_pos;
 +      size_t count;
  
 -      iocb->ki_pos += ret;
 -      file_accessed(filp);
 -
 -      return ret;
 +      if (pos < 0)
 +              return -EINVAL;
 +      if (pos >= available || !iov_iter_count(to))
 +              return 0;
 +      count = copy_to_iter(data + pos, available - pos, to);
 +      if (!count)
 +              return -EFAULT;
 +      iocb->ki_pos = pos + count;
 +      file_accessed(file);
 +      return count;
  }
 -static ssize_t hypfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
 -                            unsigned long nr_segs, loff_t offset)
 +
 +static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        int rc;
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
        struct hypfs_sb_info *fs_info = sb->s_fs_info;
 -      size_t count = iov_length(iov, nr_segs);
 +      size_t count = iov_iter_count(from);
  
        /*
         * Currently we only allow one update per second for two reasons:
        }
        hypfs_update_update(sb);
        rc = count;
 +      iov_iter_advance(from, count);
  out:
        mutex_unlock(&fs_info->lock);
        return rc;
@@@ -437,10 -440,10 +437,10 @@@ struct dentry *hypfs_create_str(struct 
  static const struct file_operations hypfs_file_ops = {
        .open           = hypfs_open,
        .release        = hypfs_release,
 -      .read           = do_sync_read,
 -      .write          = do_sync_write,
 -      .aio_read       = hypfs_aio_read,
 -      .aio_write      = hypfs_aio_write,
 +      .read           = new_sync_read,
 +      .write          = new_sync_write,
 +      .read_iter      = hypfs_read_iter,
 +      .write_iter     = hypfs_write_iter,
        .llseek         = no_llseek,
  };
  
diff --combined drivers/infiniband/hw/qib/qib_file_ops.c
@@@ -39,7 -39,6 +39,6 @@@
  #include <linux/vmalloc.h>
  #include <linux/highmem.h>
  #include <linux/io.h>
- #include <linux/aio.h>
  #include <linux/jiffies.h>
  #include <asm/pgtable.h>
  #include <linux/delay.h>
@@@ -351,10 -350,9 +350,10 @@@ static int qib_tid_update(struct qib_ct
                 * unless perhaps the user has mpin'ed the pages
                 * themselves.
                 */
 -              qib_devinfo(dd->pcidev,
 -                       "Failed to lock addr %p, %u pages: "
 -                       "errno %d\n", (void *) vaddr, cnt, -ret);
 +              qib_devinfo(
 +                      dd->pcidev,
 +                      "Failed to lock addr %p, %u pages: errno %d\n",
 +                      (void *) vaddr, cnt, -ret);
                goto done;
        }
        for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
@@@ -438,7 -436,7 +437,7 @@@ cleanup
                        goto cleanup;
                }
                if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
 -                               tidmap, sizeof tidmap)) {
 +                               tidmap, sizeof(tidmap))) {
                        ret = -EFAULT;
                        goto cleanup;
                }
@@@ -485,7 -483,7 +484,7 @@@ static int qib_tid_free(struct qib_ctxt
        }
  
        if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
 -                         sizeof tidmap)) {
 +                         sizeof(tidmap))) {
                ret = -EFAULT;
                goto done;
        }
@@@ -952,8 -950,8 +951,8 @@@ static int mmap_kvaddr(struct vm_area_s
                /* rcvegrbufs are read-only on the slave */
                if (vma->vm_flags & VM_WRITE) {
                        qib_devinfo(dd->pcidev,
 -                               "Can't map eager buffers as "
 -                               "writable (flags=%lx)\n", vma->vm_flags);
 +                               "Can't map eager buffers as writable (flags=%lx)\n",
 +                               vma->vm_flags);
                        ret = -EPERM;
                        goto bail;
                }
@@@ -1186,7 -1184,6 +1185,7 @@@ static void assign_ctxt_affinity(struc
         */
        if (weight >= qib_cpulist_count) {
                int cpu;
 +
                cpu = find_first_zero_bit(qib_cpulist,
                                          qib_cpulist_count);
                if (cpu == qib_cpulist_count)
@@@ -1249,7 -1246,10 +1248,7 @@@ static int init_subctxts(struct qib_dev
        if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16,
                uinfo->spu_userversion & 0xffff)) {
                qib_devinfo(dd->pcidev,
 -                       "Mismatched user version (%d.%d) and driver "
 -                       "version (%d.%d) while context sharing. Ensure "
 -                       "that driver and library are from the same "
 -                       "release.\n",
 +                       "Mismatched user version (%d.%d) and driver version (%d.%d) while context sharing. Ensure that driver and library are from the same release.\n",
                         (int) (uinfo->spu_userversion >> 16),
                         (int) (uinfo->spu_userversion & 0xffff),
                         QIB_USER_SWMAJOR, QIB_USER_SWMINOR);
@@@ -1390,7 -1390,6 +1389,7 @@@ static int choose_port_ctxt(struct fil
        }
        if (!ppd) {
                u32 pidx = ctxt % dd->num_pports;
 +
                if (usable(dd->pport + pidx))
                        ppd = dd->pport + pidx;
                else {
@@@ -1438,12 -1437,10 +1437,12 @@@ static int get_a_ctxt(struct file *fp, 
  
        if (alg == QIB_PORT_ALG_ACROSS) {
                unsigned inuse = ~0U;
 +
                /* find device (with ACTIVE ports) with fewest ctxts in use */
                for (ndev = 0; ndev < devmax; ndev++) {
                        struct qib_devdata *dd = qib_lookup(ndev);
                        unsigned cused = 0, cfree = 0, pusable = 0;
 +
                        if (!dd)
                                continue;
                        if (port && port <= dd->num_pports &&
        } else {
                for (ndev = 0; ndev < devmax; ndev++) {
                        struct qib_devdata *dd = qib_lookup(ndev);
 +
                        if (dd) {
                                ret = choose_port_ctxt(fp, dd, port, uinfo);
                                if (!ret)
@@@ -1559,7 -1555,6 +1558,7 @@@ static int find_hca(unsigned int cpu, i
        }
        for (ndev = 0; ndev < devmax; ndev++) {
                struct qib_devdata *dd = qib_lookup(ndev);
 +
                if (dd) {
                        if (pcibus_to_node(dd->pcidev->bus) < 0) {
                                ret = -EINVAL;
diff --combined drivers/scsi/sg.c
@@@ -33,7 -33,6 +33,6 @@@ static int sg_version_num = 30536;    /* 
  #include <linux/sched.h>
  #include <linux/string.h>
  #include <linux/mm.h>
- #include <linux/aio.h>
  #include <linux/errno.h>
  #include <linux/mtio.h>
  #include <linux/ioctl.h>
@@@ -51,6 -50,7 +50,7 @@@
  #include <linux/mutex.h>
  #include <linux/atomic.h>
  #include <linux/ratelimit.h>
+ #include <linux/uio.h>
  
  #include "scsi.h"
  #include <scsi/scsi_dbg.h>
@@@ -546,7 -546,7 +546,7 @@@ static ssize_
  sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp)
  {
        sg_io_hdr_t *hp = &srp->header;
 -      int err = 0;
 +      int err = 0, err2;
        int len;
  
        if (count < SZ_SG_IO_HDR) {
                goto err_out;
        }
  err_out:
 -      err = sg_finish_rem_req(srp);
 -      return (0 == err) ? count : err;
 +      err2 = sg_finish_rem_req(srp);
 +      return err ? : err2 ? : count;
  }
  
  static ssize_t
@@@ -1335,17 -1335,6 +1335,17 @@@ sg_rq_end_io(struct request *rq, int up
        }
        /* Rely on write phase to clean out srp status values, so no "else" */
  
 +      /*
 +       * Free the request as soon as it is complete so that its resources
 +       * can be reused without waiting for userspace to read() the
 +       * result.  But keep the associated bio (if any) around until
 +       * blk_rq_unmap_user() can be called from user context.
 +       */
 +      srp->rq = NULL;
 +      if (rq->cmd != rq->__cmd)
 +              kfree(rq->cmd);
 +      __blk_put_request(rq->q, rq);
 +
        write_lock_irqsave(&sfp->rq_list_lock, iflags);
        if (unlikely(srp->orphan)) {
                if (sfp->keep_orphan)
@@@ -1680,22 -1669,7 +1680,22 @@@ sg_start_req(Sg_request *srp, unsigned 
                        return -ENOMEM;
        }
  
 -      rq = blk_get_request(q, rw, GFP_ATOMIC);
 +      /*
 +       * NOTE
 +       *
 +       * With scsi-mq enabled, there are a fixed number of preallocated
 +       * requests equal in number to shost->can_queue.  If all of the
 +       * preallocated requests are already in use, then using GFP_ATOMIC with
 +       * blk_get_request() will return -EWOULDBLOCK, whereas using GFP_KERNEL
 +       * will cause blk_get_request() to sleep until an active command
 +       * completes, freeing up a request.  Neither option is ideal, but
 +       * GFP_KERNEL is the better choice to prevent userspace from getting an
 +       * unexpected EWOULDBLOCK.
 +       *
 +       * With scsi-mq disabled, blk_get_request() with GFP_KERNEL usually
 +       * does not sleep except under memory pressure.
 +       */
 +      rq = blk_get_request(q, rw, GFP_KERNEL);
        if (IS_ERR(rq)) {
                kfree(long_cmdp);
                return PTR_ERR(rq);
@@@ -1785,10 -1759,10 +1785,10 @@@ sg_finish_rem_req(Sg_request *srp
        SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp,
                                      "sg_finish_rem_req: res_used=%d\n",
                                      (int) srp->res_used));
 -      if (srp->rq) {
 -              if (srp->bio)
 -                      ret = blk_rq_unmap_user(srp->bio);
 +      if (srp->bio)
 +              ret = blk_rq_unmap_user(srp->bio);
  
 +      if (srp->rq) {
                if (srp->rq->cmd != srp->rq->__cmd)
                        kfree(srp->rq->cmd);
                blk_put_request(srp->rq);
diff --combined fs/aio.c
+++ b/fs/aio.c
@@@ -151,6 -151,38 +151,38 @@@ struct kioctx 
        unsigned                id;
  };
  
+ /*
+  * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+  * cancelled or completed (this makes a certain amount of sense because
+  * successful cancellation - io_cancel() - does deliver the completion to
+  * userspace).
+  *
+  * And since most things don't implement kiocb cancellation and we'd really like
+  * kiocb completion to be lockless when possible, we use ki_cancel to
+  * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+  * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+  */
+ #define KIOCB_CANCELLED               ((void *) (~0ULL))
+ struct aio_kiocb {
+       struct kiocb            common;
+       struct kioctx           *ki_ctx;
+       kiocb_cancel_fn         *ki_cancel;
+       struct iocb __user      *ki_user_iocb;  /* user's aiocb */
+       __u64                   ki_user_data;   /* user's data for completion */
+       struct list_head        ki_list;        /* the aio core uses this
+                                                * for cancellation */
+       /*
+        * If the aio_resfd field of the userspace iocb is not zero,
+        * this is the underlying eventfd context to deliver events to.
+        */
+       struct eventfd_ctx      *ki_eventfd;
+ };
  /*------ sysctl variables----*/
  static DEFINE_SPINLOCK(aio_nr_lock);
  unsigned long aio_nr;         /* current system wide number of aio requests */
@@@ -220,7 -252,7 +252,7 @@@ static int __init aio_setup(void
        if (IS_ERR(aio_mnt))
                panic("Failed to create aio fs mount.");
  
-       kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+       kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
  
        pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@@ -480,8 -512,9 +512,9 @@@ static int aio_setup_ring(struct kioct
  #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
  #define AIO_EVENTS_OFFSET     (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
  
- void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
  {
+       struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
        struct kioctx *ctx = req->ki_ctx;
        unsigned long flags;
  
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
  
- static int kiocb_cancel(struct kiocb *kiocb)
+ static int kiocb_cancel(struct aio_kiocb *kiocb)
  {
        kiocb_cancel_fn *old, *cancel;
  
                cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
        } while (cancel != old);
  
-       return cancel(kiocb);
+       return cancel(&kiocb->common);
  }
  
  static void free_ioctx(struct work_struct *work)
@@@ -550,13 -583,13 +583,13 @@@ static void free_ioctx_reqs(struct perc
  static void free_ioctx_users(struct percpu_ref *ref)
  {
        struct kioctx *ctx = container_of(ref, struct kioctx, users);
-       struct kiocb *req;
+       struct aio_kiocb *req;
  
        spin_lock_irq(&ctx->ctx_lock);
  
        while (!list_empty(&ctx->active_reqs)) {
                req = list_first_entry(&ctx->active_reqs,
-                                      struct kiocb, ki_list);
+                                      struct aio_kiocb, ki_list);
  
                list_del_init(&req->ki_list);
                kiocb_cancel(req);
@@@ -778,22 -811,6 +811,6 @@@ static int kill_ioctx(struct mm_struct 
        return 0;
  }
  
- /* wait_on_sync_kiocb:
-  *    Waits on the given sync kiocb to complete.
-  */
- ssize_t wait_on_sync_kiocb(struct kiocb *req)
- {
-       while (!req->ki_ctx) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (req->ki_ctx)
-                       break;
-               io_schedule();
-       }
-       __set_current_state(TASK_RUNNING);
-       return req->ki_user_data;
- }
- EXPORT_SYMBOL(wait_on_sync_kiocb);
  /*
   * exit_aio: called when the last user of mm goes away.  At this point, there is
   * no way for any new requests to be submitted or any of the io_* syscalls to be
@@@ -948,9 -965,9 +965,9 @@@ static void user_refill_reqs_available(
   *    Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
   */
- static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
  {
-       struct kiocb *req;
+       struct aio_kiocb *req;
  
        if (!get_reqs_available(ctx)) {
                user_refill_reqs_available(ctx);
@@@ -971,10 -988,10 +988,10 @@@ out_put
        return NULL;
  }
  
- static void kiocb_free(struct kiocb *req)
+ static void kiocb_free(struct aio_kiocb *req)
  {
-       if (req->ki_filp)
-               fput(req->ki_filp);
+       if (req->common.ki_filp)
+               fput(req->common.ki_filp);
        if (req->ki_eventfd != NULL)
                eventfd_ctx_put(req->ki_eventfd);
        kmem_cache_free(kiocb_cachep, req);
@@@ -1010,8 -1027,9 +1027,9 @@@ out
  /* aio_complete
   *    Called when the io request on the given iocb is complete.
   */
- void aio_complete(struct kiocb *iocb, long res, long res2)
+ static void aio_complete(struct kiocb *kiocb, long res, long res2)
  {
+       struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
        struct kioctx   *ctx = iocb->ki_ctx;
        struct aio_ring *ring;
        struct io_event *ev_page, *event;
         *    ref, no other paths have a way to get another ref
         *  - the sync task helpfully left a reference to itself in the iocb
         */
-       if (is_sync_kiocb(iocb)) {
-               iocb->ki_user_data = res;
-               smp_wmb();
-               iocb->ki_ctx = ERR_PTR(-EXDEV);
-               wake_up_process(iocb->ki_obj.tsk);
-               return;
-       }
+       BUG_ON(is_sync_kiocb(kiocb));
  
        if (iocb->ki_list.next) {
                unsigned long flags;
        ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
        event = ev_page + pos % AIO_EVENTS_PER_PAGE;
  
-       event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+       event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
        event->data = iocb->ki_user_data;
        event->res = res;
        event->res2 = res2;
        flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
  
        pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
-                ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+                ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
                 res, res2);
  
        /* after flagging the request as done, we
  
        percpu_ref_put(&ctx->reqs);
  }
- EXPORT_SYMBOL(aio_complete);
  
  /* aio_read_events_ring
   *    Pull an event off of the ioctx's event ring.  Returns the number of
@@@ -1285,7 -1296,7 +1296,7 @@@ SYSCALL_DEFINE2(io_setup, unsigned, nr_
  
        ret = -EINVAL;
        if (unlikely(ctx || nr_events == 0)) {
 -              pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
 +              pr_debug("EINVAL: ctx %lu nr_events %u\n",
                         ctx, nr_events);
                goto out;
        }
@@@ -1333,7 -1344,7 +1344,7 @@@ SYSCALL_DEFINE1(io_destroy, aio_context
  
                return ret;
        }
 -      pr_debug("EINVAL: io_destroy: invalid context id\n");
 +      pr_debug("EINVAL: invalid context id\n");
        return -EINVAL;
  }
  
@@@ -1344,12 -1355,13 +1355,13 @@@ typedef ssize_t (rw_iter_op)(struct kio
  static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
                                     int rw, char __user *buf,
                                     unsigned long *nr_segs,
+                                    size_t *len,
                                     struct iovec **iovec,
                                     bool compat)
  {
        ssize_t ret;
  
-       *nr_segs = kiocb->ki_nbytes;
+       *nr_segs = *len;
  
  #ifdef CONFIG_COMPAT
        if (compat)
        if (ret < 0)
                return ret;
  
-       /* ki_nbytes now reflect bytes instead of segs */
-       kiocb->ki_nbytes = ret;
+       /* len now reflect bytes instead of segs */
+       *len = ret;
        return 0;
  }
  
  static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
                                       int rw, char __user *buf,
                                       unsigned long *nr_segs,
+                                      size_t len,
                                       struct iovec *iovec)
  {
-       if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
+       if (unlikely(!access_ok(!rw, buf, len)))
                return -EFAULT;
  
        iovec->iov_base = buf;
-       iovec->iov_len = kiocb->ki_nbytes;
+       iovec->iov_len = len;
        *nr_segs = 1;
        return 0;
  }
   *    Performs the initial checks and io submission.
   */
  static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
-                           char __user *buf, bool compat)
+                           char __user *buf, size_t len, bool compat)
  {
        struct file *file = req->ki_filp;
        ssize_t ret;
@@@ -1423,21 -1436,21 +1436,21 @@@ rw_common
                if (!rw_op && !iter_op)
                        return -EINVAL;
  
-               ret = (opcode == IOCB_CMD_PREADV ||
-                      opcode == IOCB_CMD_PWRITEV)
-                       ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
-                                               &iovec, compat)
-                       : aio_setup_single_vector(req, rw, buf, &nr_segs,
-                                                 iovec);
+               if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+                       ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+                                               &len, &iovec, compat);
+               else
+                       ret = aio_setup_single_vector(req, rw, buf, &nr_segs,
+                                                 len, iovec);
                if (!ret)
-                       ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+                       ret = rw_verify_area(rw, file, &req->ki_pos, len);
                if (ret < 0) {
                        if (iovec != inline_vecs)
                                kfree(iovec);
                        return ret;
                }
  
-               req->ki_nbytes = ret;
+               len = ret;
  
                /* XXX: move/kill - rw_verify_area()? */
                /* This matches the pread()/pwrite() logic */
                        file_start_write(file);
  
                if (iter_op) {
-                       iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
+                       iov_iter_init(&iter, rw, iovec, nr_segs, len);
                        ret = iter_op(req, &iter);
                } else {
                        ret = rw_op(req, iovec, nr_segs, req->ki_pos);
  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                         struct iocb *iocb, bool compat)
  {
-       struct kiocb *req;
+       struct aio_kiocb *req;
        ssize_t ret;
  
        /* enforce forwards compatibility on users */
            (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
            ((ssize_t)iocb->aio_nbytes < 0)
           )) {
 -              pr_debug("EINVAL: io_submit: overflow check\n");
 +              pr_debug("EINVAL: overflow check\n");
                return -EINVAL;
        }
  
        if (unlikely(!req))
                return -EAGAIN;
  
-       req->ki_filp = fget(iocb->aio_fildes);
-       if (unlikely(!req->ki_filp)) {
+       req->common.ki_filp = fget(iocb->aio_fildes);
+       if (unlikely(!req->common.ki_filp)) {
                ret = -EBADF;
                goto out_put_req;
        }
+       req->common.ki_pos = iocb->aio_offset;
+       req->common.ki_complete = aio_complete;
+       req->common.ki_flags = 0;
  
        if (iocb->aio_flags & IOCB_FLAG_RESFD) {
                /*
                        req->ki_eventfd = NULL;
                        goto out_put_req;
                }
+               req->common.ki_flags |= IOCB_EVENTFD;
        }
  
        ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
                goto out_put_req;
        }
  
-       req->ki_obj.user = user_iocb;
+       req->ki_user_iocb = user_iocb;
        req->ki_user_data = iocb->aio_data;
-       req->ki_pos = iocb->aio_offset;
-       req->ki_nbytes = iocb->aio_nbytes;
  
-       ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+       ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
                           (char __user *)(unsigned long)iocb->aio_buf,
+                          iocb->aio_nbytes,
                           compat);
        if (ret)
                goto out_put_req;
@@@ -1643,10 -1660,10 +1660,10 @@@ SYSCALL_DEFINE3(io_submit, aio_context_
  /* lookup_kiocb
   *    Finds a given iocb for cancellation.
   */
- static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
-                                 u32 key)
+ static struct aio_kiocb *
+ lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
  {
-       struct list_head *pos;
+       struct aio_kiocb *kiocb;
  
        assert_spin_locked(&ctx->ctx_lock);
  
                return NULL;
  
        /* TODO: use a hash or array, this sucks. */
-       list_for_each(pos, &ctx->active_reqs) {
-               struct kiocb *kiocb = list_kiocb(pos);
-               if (kiocb->ki_obj.user == iocb)
+       list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+               if (kiocb->ki_user_iocb == iocb)
                        return kiocb;
        }
        return NULL;
@@@ -1676,7 -1692,7 +1692,7 @@@ SYSCALL_DEFINE3(io_cancel, aio_context_
                struct io_event __user *, result)
  {
        struct kioctx *ctx;
-       struct kiocb *kiocb;
+       struct aio_kiocb *kiocb;
        u32 key;
        int ret;
  
diff --combined fs/btrfs/file.c
@@@ -24,7 -24,6 +24,6 @@@
  #include <linux/string.h>
  #include <linux/backing-dev.h>
  #include <linux/mpage.h>
- #include <linux/aio.h>
  #include <linux/falloc.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
@@@ -32,6 -31,7 +31,7 @@@
  #include <linux/compat.h>
  #include <linux/slab.h>
  #include <linux/btrfs.h>
+ #include <linux/uio.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@@ -1811,10 -1811,22 +1811,10 @@@ static ssize_t btrfs_file_write_iter(st
        mutex_unlock(&inode->i_mutex);
  
        /*
 -       * we want to make sure fsync finds this change
 -       * but we haven't joined a transaction running right now.
 -       *
 -       * Later on, someone is sure to update the inode and get the
 -       * real transid recorded.
 -       *
 -       * We set last_trans now to the fs_info generation + 1,
 -       * this will either be one more than the running transaction
 -       * or the generation used for the next transaction if there isn't
 -       * one running right now.
 -       *
         * We also have to set last_sub_trans to the current log transid,
         * otherwise subsequent syncs to a file that's been synced in this
         * transaction will appear to have already occurred.
         */
 -      BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0) {
                err = generic_write_sync(file, pos, num_written);
@@@ -1947,37 -1959,25 +1947,37 @@@ int btrfs_sync_file(struct file *file, 
        atomic_inc(&root->log_batch);
  
        /*
 -       * check the transaction that last modified this inode
 -       * and see if its already been committed
 -       */
 -      if (!BTRFS_I(inode)->last_trans) {
 -              mutex_unlock(&inode->i_mutex);
 -              goto out;
 -      }
 -
 -      /*
 -       * if the last transaction that changed this file was before
 -       * the current transaction, we can bail out now without any
 -       * syncing
 +       * If the last transaction that changed this file was before the current
 +       * transaction and we have the full sync flag set in our inode, we can
 +       * bail out now without any syncing.
 +       *
 +       * Note that we can't bail out if the full sync flag isn't set. This is
 +       * because when the full sync flag is set we start all ordered extents
 +       * and wait for them to fully complete - when they complete they update
 +       * the inode's last_trans field through:
 +       *
 +       *     btrfs_finish_ordered_io() ->
 +       *         btrfs_update_inode_fallback() ->
 +       *             btrfs_update_inode() ->
 +       *                 btrfs_set_inode_last_trans()
 +       *
 +       * So we are sure that last_trans is up to date and can do this check to
 +       * bail out safely. For the fast path, when the full sync flag is not
 +       * set in our inode, we can not do it because we start only our ordered
 +       * extents and don't wait for them to complete (that is when
 +       * btrfs_finish_ordered_io runs), so here at this point their last_trans
 +       * value might be less than or equal to fs_info->last_trans_committed,
 +       * and setting a speculative last_trans for an inode when a buffered
 +       * write is made (such as fs_info->generation + 1 for example) would not
 +       * be reliable since after setting the value and before fsync is called
 +       * any number of transactions can start and commit (transaction kthread
 +       * commits the current transaction periodically), and a transaction
 +       * commit does not start nor waits for ordered extents to complete.
         */
        smp_mb();
        if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
 -          BTRFS_I(inode)->last_trans <=
 -          root->fs_info->last_trans_committed) {
 -              BTRFS_I(inode)->last_trans = 0;
 -
 +          (full_sync && BTRFS_I(inode)->last_trans <=
 +           root->fs_info->last_trans_committed)) {
                /*
                * We've had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
@@@ -2275,8 -2275,6 +2275,8 @@@ static int btrfs_punch_hole(struct inod
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
 +      bool truncated_page = false;
 +      bool updated_inode = false;
  
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
 -              if (offset < ino_size)
 +              if (offset < ino_size) {
 +                      truncated_page = true;
                        ret = btrfs_truncate_page(inode, offset, len, 0);
 +              } else {
 +                      ret = 0;
 +              }
                goto out_only_mutex;
        }
  
        /* zero back part of the first page */
        if (offset < ino_size) {
 +              truncated_page = true;
                ret = btrfs_truncate_page(inode, offset, 0, 0);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
                if (!ret) {
                        /* zero the front end of the last page */
                        if (tail_start + tail_len < ino_size) {
 +                              truncated_page = true;
                                ret = btrfs_truncate_page(inode,
                                                tail_start + tail_len, 0, 1);
                                if (ret)
        }
  
        if (lockend < lockstart) {
 -              mutex_unlock(&inode->i_mutex);
 -              return 0;
 +              ret = 0;
 +              goto out_only_mutex;
        }
  
        while (1) {
@@@ -2514,7 -2506,6 +2514,7 @@@ out_trans
  
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
 +      updated_inode = true;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
  out_free:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
  out_only_mutex:
 +      if (!updated_inode && truncated_page && !ret && !err) {
 +              /*
 +               * If we only end up zeroing part of a page, we still need to
 +               * update the inode item, so that all the time fields are
 +               * updated as well as the necessary btrfs inode in memory fields
 +               * for detecting, at fsync time, if the inode isn't yet in the
 +               * log tree or it's there but not up to date.
 +               */
 +              trans = btrfs_start_transaction(root, 1);
 +              if (IS_ERR(trans)) {
 +                      err = PTR_ERR(trans);
 +              } else {
 +                      err = btrfs_update_inode(trans, root, inode);
 +                      ret = btrfs_end_transaction(trans, root);
 +              }
 +      }
        mutex_unlock(&inode->i_mutex);
        if (ret && !err)
                err = ret;
diff --combined fs/btrfs/inode.c
@@@ -32,7 -32,6 +32,6 @@@
  #include <linux/writeback.h>
  #include <linux/statfs.h>
  #include <linux/compat.h>
- #include <linux/aio.h>
  #include <linux/bit_spinlock.h>
  #include <linux/xattr.h>
  #include <linux/posix_acl.h>
@@@ -43,6 -42,7 +42,7 @@@
  #include <linux/btrfs.h>
  #include <linux/blkdev.h>
  #include <linux/posix_acl_xattr.h>
+ #include <linux/uio.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@@ -108,13 -108,6 +108,13 @@@ static struct extent_map *create_pinned
  
  static int btrfs_dirty_inode(struct inode *inode);
  
 +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 +void btrfs_test_inode_set_ops(struct inode *inode)
 +{
 +      BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 +}
 +#endif
 +
  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@@ -1537,32 -1530,10 +1537,32 @@@ static int run_delalloc_range(struct in
  static void btrfs_split_extent_hook(struct inode *inode,
                                    struct extent_state *orig, u64 split)
  {
 +      u64 size;
 +
        /* not delalloc, ignore it */
        if (!(orig->state & EXTENT_DELALLOC))
                return;
  
 +      size = orig->end - orig->start + 1;
 +      if (size > BTRFS_MAX_EXTENT_SIZE) {
 +              u64 num_extents;
 +              u64 new_size;
 +
 +              /*
 +               * See the explanation in btrfs_merge_extent_hook, the same
 +               * applies here, just in reverse.
 +               */
 +              new_size = orig->end - split + 1;
 +              num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                                      BTRFS_MAX_EXTENT_SIZE);
 +              new_size = split - orig->start;
 +              num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                                      BTRFS_MAX_EXTENT_SIZE);
 +              if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                            BTRFS_MAX_EXTENT_SIZE) >= num_extents)
 +                      return;
 +      }
 +
        spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->outstanding_extents++;
        spin_unlock(&BTRFS_I(inode)->lock);
@@@ -1578,55 -1549,10 +1578,55 @@@ static void btrfs_merge_extent_hook(str
                                    struct extent_state *new,
                                    struct extent_state *other)
  {
 +      u64 new_size, old_size;
 +      u64 num_extents;
 +
        /* not delalloc, ignore it */
        if (!(other->state & EXTENT_DELALLOC))
                return;
  
 +      if (new->start > other->start)
 +              new_size = new->end - other->start + 1;
 +      else
 +              new_size = other->end - new->start + 1;
 +
 +      /* we're not bigger than the max, unreserve the space and go */
 +      if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
 +              spin_lock(&BTRFS_I(inode)->lock);
 +              BTRFS_I(inode)->outstanding_extents--;
 +              spin_unlock(&BTRFS_I(inode)->lock);
 +              return;
 +      }
 +
 +      /*
 +       * We have to add up either side to figure out how many extents were
 +       * accounted for before we merged into one big extent.  If the number of
 +       * extents we accounted for is <= the amount we need for the new range
 +       * then we can return, otherwise drop.  Think of it like this
 +       *
 +       * [ 4k][MAX_SIZE]
 +       *
 +       * So we've grown the extent by a MAX_SIZE extent, this would mean we
 +       * need 2 outstanding extents, on one side we have 1 and the other side
 +       * we have 1 so they are == and we can return.  But in this case
 +       *
 +       * [MAX_SIZE+4k][MAX_SIZE+4k]
 +       *
 +       * Each range on their own accounts for 2 extents, but merged together
 +       * they are only 3 extents worth of accounting, so we need to drop in
 +       * this case.
 +       */
 +      old_size = other->end - other->start + 1;
 +      num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                              BTRFS_MAX_EXTENT_SIZE);
 +      old_size = new->end - new->start + 1;
 +      num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                               BTRFS_MAX_EXTENT_SIZE);
 +
 +      if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                    BTRFS_MAX_EXTENT_SIZE) >= num_extents)
 +              return;
 +
        spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->outstanding_extents--;
        spin_unlock(&BTRFS_I(inode)->lock);
@@@ -1678,7 -1604,7 +1678,7 @@@ static void btrfs_del_delalloc_inode(st
   * have pending delalloc work to be done.
   */
  static void btrfs_set_bit_hook(struct inode *inode,
 -                             struct extent_state *state, unsigned long *bits)
 +                             struct extent_state *state, unsigned *bits)
  {
  
        if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
 +              /* For sanity tests */
 +              if (btrfs_test_is_dummy_root(root))
 +                      return;
 +
                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
   */
  static void btrfs_clear_bit_hook(struct inode *inode,
                                 struct extent_state *state,
 -                               unsigned long *bits)
 +                               unsigned *bits)
  {
        u64 len = state->end + 1 - state->start;
 +      u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
 +                                  BTRFS_MAX_EXTENT_SIZE);
  
        spin_lock(&BTRFS_I(inode)->lock);
        if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
                        *bits &= ~EXTENT_FIRST_DELALLOC;
                } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
                        spin_lock(&BTRFS_I(inode)->lock);
 -                      BTRFS_I(inode)->outstanding_extents--;
 +                      BTRFS_I(inode)->outstanding_extents -= num_extents;
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
                    root != root->fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
  
 +              /* For sanity tests. */
 +              if (btrfs_test_is_dummy_root(root))
 +                      return;
 +
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
@@@ -3029,7 -2945,7 +3029,7 @@@ static int __readpage_endio_check(struc
        return 0;
  zeroit:
        if (__ratelimit(&_rs))
 -              btrfs_info(BTRFS_I(inode)->root->fs_info,
 +              btrfs_warn(BTRFS_I(inode)->root->fs_info,
                           "csum failed ino %llu off %llu csum %u expected csum %u",
                           btrfs_ino(inode), start, csum, csum_expected);
        memset(kaddr + pgoff, 1, len);
@@@ -3491,7 -3407,7 +3491,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
  
  out:
        if (ret)
 -              btrfs_crit(root->fs_info,
 +              btrfs_err(root->fs_info,
                        "could not do orphan cleanup %d", ret);
        btrfs_free_path(path);
        return ret;
@@@ -3574,6 -3490,7 +3574,6 @@@ static void btrfs_read_locked_inode(str
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_inode_item *inode_item;
 -      struct btrfs_timespec *tspec;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        unsigned long ptr;
        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
        btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
  
 -      tspec = btrfs_inode_atime(inode_item);
 -      inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 -      inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 +      inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
 +      inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
  
 -      tspec = btrfs_inode_mtime(inode_item);
 -      inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 -      inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 +      inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
 +      inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
  
 -      tspec = btrfs_inode_ctime(inode_item);
 -      inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 -      inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 +      inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
 +      inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
 +
 +      BTRFS_I(inode)->i_otime.tv_sec =
 +              btrfs_timespec_sec(leaf, &inode_item->otime);
 +      BTRFS_I(inode)->i_otime.tv_nsec =
 +              btrfs_timespec_nsec(leaf, &inode_item->otime);
  
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@@ -3741,26 -3656,21 +3741,26 @@@ static void fill_inode_item(struct btrf
        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
  
 -      btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
 +      btrfs_set_token_timespec_sec(leaf, &item->atime,
                                     inode->i_atime.tv_sec, &token);
 -      btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
 +      btrfs_set_token_timespec_nsec(leaf, &item->atime,
                                      inode->i_atime.tv_nsec, &token);
  
 -      btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
 +      btrfs_set_token_timespec_sec(leaf, &item->mtime,
                                     inode->i_mtime.tv_sec, &token);
 -      btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
 +      btrfs_set_token_timespec_nsec(leaf, &item->mtime,
                                      inode->i_mtime.tv_nsec, &token);
  
 -      btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
 +      btrfs_set_token_timespec_sec(leaf, &item->ctime,
                                     inode->i_ctime.tv_sec, &token);
 -      btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
 +      btrfs_set_token_timespec_nsec(leaf, &item->ctime,
                                      inode->i_ctime.tv_nsec, &token);
  
 +      btrfs_set_token_timespec_sec(leaf, &item->otime,
 +                                   BTRFS_I(inode)->i_otime.tv_sec, &token);
 +      btrfs_set_token_timespec_nsec(leaf, &item->otime,
 +                                    BTRFS_I(inode)->i_otime.tv_nsec, &token);
 +
        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
                                     &token);
        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@@ -5097,7 -5007,6 +5097,7 @@@ static int fixup_tree_root_location(str
        struct btrfs_root *new_root;
        struct btrfs_root_ref *ref;
        struct extent_buffer *leaf;
 +      struct btrfs_key key;
        int ret;
        int err = 0;
  
        }
  
        err = -ENOENT;
 -      ret = btrfs_find_item(root->fs_info->tree_root, path,
 -                              BTRFS_I(dir)->root->root_key.objectid,
 -                              location->objectid, BTRFS_ROOT_REF_KEY, NULL);
 +      key.objectid = BTRFS_I(dir)->root->root_key.objectid;
 +      key.type = BTRFS_ROOT_REF_KEY;
 +      key.offset = location->objectid;
 +
 +      ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
 +                              0, 0);
        if (ret) {
                if (ret < 0)
                        err = ret;
@@@ -5352,10 -5258,7 +5352,10 @@@ static struct inode *new_simple_dir(str
        inode->i_op = &btrfs_dir_ro_inode_operations;
        inode->i_fop = &simple_dir_operations;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 -      inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 +      inode->i_mtime = CURRENT_TIME;
 +      inode->i_atime = inode->i_mtime;
 +      inode->i_ctime = inode->i_mtime;
 +      BTRFS_I(inode)->i_otime = inode->i_mtime;
  
        return inode;
  }
@@@ -5923,12 -5826,7 +5923,12 @@@ static struct inode *btrfs_new_inode(st
  
        inode_init_owner(inode, dir, mode);
        inode_set_bytes(inode, 0);
 -      inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 +
 +      inode->i_mtime = CURRENT_TIME;
 +      inode->i_atime = inode->i_mtime;
 +      inode->i_ctime = inode->i_mtime;
 +      BTRFS_I(inode)->i_otime = inode->i_mtime;
 +
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                  struct btrfs_inode_item);
        memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@@ -7236,28 -7134,17 +7236,28 @@@ static int btrfs_get_blocks_direct(stru
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
 +      u64 *outstanding_extents = NULL;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
  
        if (create)
 -              unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
 +              unlock_bits |= EXTENT_DIRTY;
        else
                len = min_t(u64, len, root->sectorsize);
  
        lockstart = start;
        lockend = start + len - 1;
  
 +      if (current->journal_info) {
 +              /*
 +               * Need to pull our outstanding extents and set journal_info to NULL so
 +               * that anything that needs to check if there's a transaction doesn't get
 +               * confused.
 +               */
 +              outstanding_extents = current->journal_info;
 +              current->journal_info = NULL;
 +      }
 +
        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
             em->block_start != EXTENT_MAP_HOLE)) {
                int type;
 -              int ret;
                u64 block_start, orig_start, orig_block_len, ram_bytes;
  
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@@ -7381,21 -7269,14 +7381,21 @@@ unlock
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
  
 -              spin_lock(&BTRFS_I(inode)->lock);
 -              BTRFS_I(inode)->outstanding_extents++;
 -              spin_unlock(&BTRFS_I(inode)->lock);
 +              /*
 +               * If we have an outstanding_extents count still set then we're
 +               * within our reservation, otherwise we need to adjust our inode
 +               * counter appropriately.
 +               */
 +              if (*outstanding_extents) {
 +                      (*outstanding_extents)--;
 +              } else {
 +                      spin_lock(&BTRFS_I(inode)->lock);
 +                      BTRFS_I(inode)->outstanding_extents++;
 +                      spin_unlock(&BTRFS_I(inode)->lock);
 +              }
  
 -              ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
 -                                   lockstart + len - 1, EXTENT_DELALLOC, NULL,
 -                                   &cached_state, GFP_NOFS);
 -              BUG_ON(ret);
 +              current->journal_info = outstanding_extents;
 +              btrfs_free_reserved_data_space(inode, len);
        }
  
        /*
  unlock_err:
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
 +      if (outstanding_extents)
 +              current->journal_info = outstanding_extents;
        return ret;
  }
  
@@@ -7926,7 -7805,8 +7926,7 @@@ static int btrfs_submit_direct_hook(in
        }
  
        /* async crcs make it difficult to collect full stripe writes. */
 -      if (btrfs_get_alloc_profile(root, 1) &
 -          (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
 +      if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
                async_submit = 0;
        else
                async_submit = 1;
@@@ -8119,7 -7999,6 +8119,7 @@@ static ssize_t btrfs_direct_IO(int rw, 
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
 +      u64 outstanding_extents = 0;
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
                ret = btrfs_delalloc_reserve_space(inode, count);
                if (ret)
                        goto out;
 +              outstanding_extents = div64_u64(count +
 +                                              BTRFS_MAX_EXTENT_SIZE - 1,
 +                                              BTRFS_MAX_EXTENT_SIZE);
 +
 +              /*
 +               * We need to know how many extents we reserved so that we can
 +               * do the accounting properly if we go over the number we
 +               * originally calculated.  Abuse current->journal_info for this.
 +               */
 +              current->journal_info = &outstanding_extents;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_done(inode);
                        iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
 +              current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED)
                        btrfs_delalloc_release_space(inode, count);
                else if (ret >= 0 && (size_t)ret < count)
                        btrfs_delalloc_release_space(inode,
                                                     count - (size_t)ret);
 -              else
 -                      btrfs_delalloc_release_metadata(inode, 0);
        }
  out:
        if (wakeup)
@@@ -8705,9 -8575,6 +8705,9 @@@ struct inode *btrfs_alloc_inode(struct 
  
        ei->delayed_node = NULL;
  
 +      ei->i_otime.tv_sec = 0;
 +      ei->i_otime.tv_nsec = 0;
 +
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
diff --combined fs/ceph/file.c
@@@ -7,7 -7,6 +7,6 @@@
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/writeback.h>
- #include <linux/aio.h>
  #include <linux/falloc.h>
  
  #include "super.h"
@@@ -275,10 -274,10 +274,10 @@@ int ceph_atomic_open(struct inode *dir
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
 +      err = ceph_handle_snapdir(req, dentry, err);
        if (err)
                goto out_req;
  
 -      err = ceph_handle_snapdir(req, dentry, err);
        if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
  
        }
        if (err)
                goto out_req;
 -      if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
 +      if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) {
                /* make vfs retry on splice, ENOENT, or symlink */
                dout("atomic_open finish_no_open on dn %p\n", dn);
                err = finish_no_open(file, dn);
@@@ -392,14 -391,13 +391,14 @@@ more
        if (ret >= 0) {
                int didpages;
                if (was_short && (pos + ret < inode->i_size)) {
 -                      u64 tmp = min(this_len - ret,
 -                                      inode->i_size - pos - ret);
 +                      int zlen = min(this_len - ret,
 +                                     inode->i_size - pos - ret);
 +                      int zoff = (o_direct ? buf_align : io_align) +
 +                                  read + ret;
                        dout(" zero gap %llu to %llu\n",
 -                              pos + ret, pos + ret + tmp);
 -                      ceph_zero_page_vector_range(page_align + read + ret,
 -                                                      tmp, pages);
 -                      ret += tmp;
 +                              pos + ret, pos + ret + zlen);
 +                      ceph_zero_page_vector_range(zoff, zlen, pages);
 +                      ret += zlen;
                }
  
                didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@@ -808,7 -806,7 +807,7 @@@ static ssize_t ceph_read_iter(struct ki
  {
        struct file *filp = iocb->ki_filp;
        struct ceph_file_info *fi = filp->private_data;
-       size_t len = iocb->ki_nbytes;
+       size_t len = iov_iter_count(to);
        struct inode *inode = file_inode(filp);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct page *pinned_page = NULL;
@@@ -879,34 -877,28 +878,34 @@@ again
  
                i_size = i_size_read(inode);
                if (retry_op == READ_INLINE) {
 -                      /* does not support inline data > PAGE_SIZE */
 -                      if (i_size > PAGE_CACHE_SIZE) {
 -                              ret = -EIO;
 -                      } else if (iocb->ki_pos < i_size) {
 +                      BUG_ON(ret > 0 || read > 0);
 +                      if (iocb->ki_pos < i_size &&
 +                          iocb->ki_pos < PAGE_CACHE_SIZE) {
                                loff_t end = min_t(loff_t, i_size,
                                                   iocb->ki_pos + len);
 +                              end = min_t(loff_t, end, PAGE_CACHE_SIZE);
                                if (statret < end)
                                        zero_user_segment(page, statret, end);
                                ret = copy_page_to_iter(page,
                                                iocb->ki_pos & ~PAGE_MASK,
                                                end - iocb->ki_pos, to);
                                iocb->ki_pos += ret;
 -                      } else {
 -                              ret = 0;
 +                              read += ret;
 +                      }
 +                      if (iocb->ki_pos < i_size && read < len) {
 +                              size_t zlen = min_t(size_t, len - read,
 +                                                  i_size - iocb->ki_pos);
 +                              ret = iov_iter_zero(zlen, to);
 +                              iocb->ki_pos += ret;
 +                              read += ret;
                        }
                        __free_pages(page, 0);
 -                      return ret;
 +                      return read;
                }
  
                /* hit EOF or hole? */
                if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
 -                      ret < len) {
 +                  ret < len) {
                        dout("sync_read hit hole, ppos %lld < size %lld"
                             ", reading more\n", iocb->ki_pos,
                             inode->i_size);
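
The READ_INLINE path above first serves the part of the request that the inline page covers, then zero-fills whatever remains of the request that still lies below i_size. A condensed, hypothetical sketch of that pattern (inline_read_tail() and its argument list are illustrative, not part of the patch):

#include <linux/pagemap.h>
#include <linux/uio.h>

/* Illustrative helper, not in the patch: copy from an inline page, then
 * zero-fill the rest of the requested range that is still inside i_size. */
static ssize_t inline_read_tail(struct kiocb *iocb, struct iov_iter *to,
				struct page *page, loff_t i_size, size_t len)
{
	size_t read = 0;
	ssize_t ret;

	/* Copy whatever the inline page covers, capped at one page. */
	if (iocb->ki_pos < i_size && iocb->ki_pos < PAGE_CACHE_SIZE) {
		loff_t end = min_t(loff_t, i_size, iocb->ki_pos + len);

		end = min_t(loff_t, end, PAGE_CACHE_SIZE);
		ret = copy_page_to_iter(page, iocb->ki_pos & ~PAGE_MASK,
					end - iocb->ki_pos, to);
		iocb->ki_pos += ret;
		read += ret;
	}
	/* The object may be larger than the inline data: zero the rest. */
	if (iocb->ki_pos < i_size && read < len) {
		size_t zlen = min_t(size_t, len - read,
				    i_size - iocb->ki_pos);

		ret = iov_iter_zero(zlen, to);
		iocb->ki_pos += ret;
		read += ret;
	}
	return read;
}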
diff --combined fs/ecryptfs/file.c
@@@ -31,7 -31,6 +31,6 @@@
  #include <linux/security.h>
  #include <linux/compat.h>
  #include <linux/fs_stack.h>
- #include <linux/aio.h>
  #include "ecryptfs_kernel.h"
  
  /**
@@@ -52,12 -51,6 +51,6 @@@ static ssize_t ecryptfs_read_update_ati
        struct file *file = iocb->ki_filp;
  
        rc = generic_file_read_iter(iocb, to);
-       /*
-        * Even though this is a async interface, we need to wait
-        * for IO to finish to update atime
-        */
-       if (-EIOCBQUEUED == rc)
-               rc = wait_on_sync_kiocb(iocb);
        if (rc >= 0) {
                path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
                touch_atime(path);
@@@ -230,7 -223,7 +223,7 @@@ static int ecryptfs_open(struct inode *
        }
        ecryptfs_set_file_lower(
                file, ecryptfs_inode_to_private(inode)->lower_file);
 -      if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
 +      if (d_is_dir(ecryptfs_dentry)) {
                ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
                mutex_lock(&crypt_stat->cs_mutex);
                crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
@@@ -303,22 -296,9 +296,22 @@@ ecryptfs_unlocked_ioctl(struct file *fi
        struct file *lower_file = ecryptfs_file_to_lower(file);
        long rc = -ENOTTY;
  
 -      if (lower_file->f_op->unlocked_ioctl)
 +      if (!lower_file->f_op->unlocked_ioctl)
 +              return rc;
 +
 +      switch (cmd) {
 +      case FITRIM:
 +      case FS_IOC_GETFLAGS:
 +      case FS_IOC_SETFLAGS:
 +      case FS_IOC_GETVERSION:
 +      case FS_IOC_SETVERSION:
                rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
 -      return rc;
 +              fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
 +
 +              return rc;
 +      default:
 +              return rc;
 +      }
  }
  
  #ifdef CONFIG_COMPAT
@@@ -328,22 -308,9 +321,22 @@@ ecryptfs_compat_ioctl(struct file *file
        struct file *lower_file = ecryptfs_file_to_lower(file);
        long rc = -ENOIOCTLCMD;
  
 -      if (lower_file->f_op->compat_ioctl)
 +      if (!lower_file->f_op->compat_ioctl)
 +              return rc;
 +
 +      switch (cmd) {
 +      case FITRIM:
 +      case FS_IOC32_GETFLAGS:
 +      case FS_IOC32_SETFLAGS:
 +      case FS_IOC32_GETVERSION:
 +      case FS_IOC32_SETVERSION:
                rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
 -      return rc;
 +              fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
 +
 +              return rc;
 +      default:
 +              return rc;
 +      }
  }
  #endif
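
The ioctl hunks above replace blind pass-through with an explicit whitelist and copy the lower inode's attributes back, so that flag changes made by the lower filesystem become visible on the eCryptfs inode. A minimal sketch of that stacking pattern, assuming a hypothetical stacked_ioctl() helper that already has the lower file resolved:

#include <linux/fs.h>
#include <linux/fs_stack.h>

/* Hypothetical helper showing the whitelist-and-copy-back pattern. */
static long stacked_ioctl(struct file *file, struct file *lower_file,
			  unsigned int cmd, unsigned long arg)
{
	long rc = -ENOTTY;

	if (!lower_file->f_op->unlocked_ioctl)
		return rc;

	switch (cmd) {
	case FITRIM:
	case FS_IOC_GETFLAGS:
	case FS_IOC_SETFLAGS:
	case FS_IOC_GETVERSION:
	case FS_IOC_SETVERSION:
		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
		/* e.g. SETFLAGS may have changed i_flags on the lower inode */
		fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
		return rc;
	default:
		return rc;
	}
}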
  
diff --combined fs/ext4/indirect.c
@@@ -20,9 -20,9 +20,9 @@@
   *    (sct@redhat.com), 1993, 1998
   */
  
- #include <linux/aio.h>
  #include "ext4_jbd2.h"
  #include "truncate.h"
+ #include <linux/uio.h>
  
  #include <trace/events/ext4.h>
  
@@@ -1401,7 -1401,10 +1401,7 @@@ end_range
                                 * to free. Everything was covered by the start
                                 * of the range.
                                 */
 -                              return 0;
 -                      } else {
 -                              /* Shared branch grows from an indirect block */
 -                              partial2--;
 +                              goto do_indirects;
                        }
                } else {
                        /*
        /* Punch happened within the same level (n == n2) */
        partial = ext4_find_shared(inode, n, offsets, chain, &nr);
        partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
 -      /*
 -       * ext4_find_shared returns Indirect structure which
 -       * points to the last element which should not be
 -       * removed by truncate. But this is end of the range
 -       * in punch_hole so we need to point to the next element
 -       */
 -      partial2->p++;
 -      while ((partial > chain) || (partial2 > chain2)) {
 -              /* We're at the same block, so we're almost finished */
 -              if ((partial->bh && partial2->bh) &&
 -                  (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
 -                      if ((partial > chain) && (partial2 > chain2)) {
 +
 +      /* Free top, but only if partial2 isn't its subtree. */
 +      if (nr) {
 +              int level = min(partial - chain, partial2 - chain2);
 +              int i;
 +              int subtree = 1;
 +
 +              for (i = 0; i <= level; i++) {
 +                      if (offsets[i] != offsets2[i]) {
 +                              subtree = 0;
 +                              break;
 +                      }
 +              }
 +
 +              if (!subtree) {
 +                      if (partial == chain) {
 +                              /* Shared branch grows from the inode */
 +                              ext4_free_branches(handle, inode, NULL,
 +                                                 &nr, &nr+1,
 +                                                 (chain+n-1) - partial);
 +                              *partial->p = 0;
 +                      } else {
 +                              /* Shared branch grows from an indirect block */
 +                              BUFFER_TRACE(partial->bh, "get_write_access");
                                ext4_free_branches(handle, inode, partial->bh,
 -                                                 partial->p + 1,
 -                                                 partial2->p,
 +                                                 partial->p,
 +                                                 partial->p+1,
                                                   (chain+n-1) - partial);
 -                              BUFFER_TRACE(partial->bh, "call brelse");
 -                              brelse(partial->bh);
 -                              BUFFER_TRACE(partial2->bh, "call brelse");
 -                              brelse(partial2->bh);
                        }
 -                      return 0;
                }
 +      }
 +
 +      if (!nr2) {
                /*
 -               * Clear the ends of indirect blocks on the shared branch
 -               * at the start of the range
 +               * ext4_find_shared returns Indirect structure which
 +               * points to the last element which should not be
 +               * removed by truncate. But this is end of the range
 +               * in punch_hole so we need to point to the next element
                 */
 -              if (partial > chain) {
 +              partial2->p++;
 +      }
 +
 +      while (partial > chain || partial2 > chain2) {
 +              int depth = (chain+n-1) - partial;
 +              int depth2 = (chain2+n2-1) - partial2;
 +
 +              if (partial > chain && partial2 > chain2 &&
 +                  partial->bh->b_blocknr == partial2->bh->b_blocknr) {
 +                      /*
 +                       * We've converged on the same block. Clear the range,
 +                       * then we're done.
 +                       */
                        ext4_free_branches(handle, inode, partial->bh,
 -                                 partial->p + 1,
 -                                 (__le32 *)partial->bh->b_data+addr_per_block,
 -                                 (chain+n-1) - partial);
 +                                         partial->p + 1,
 +                                         partial2->p,
 +                                         (chain+n-1) - partial);
                        BUFFER_TRACE(partial->bh, "call brelse");
                        brelse(partial->bh);
 -                      partial--;
 +                      BUFFER_TRACE(partial2->bh, "call brelse");
 +                      brelse(partial2->bh);
 +                      return 0;
                }
 +
                /*
 -               * Clear the ends of indirect blocks on the shared branch
 -               * at the end of the range
 +               * The start and end partial branches may not be at the same
 +               * level even though the punch happened within one level. So, we
 +               * give them a chance to arrive at the same level, then walk
 +               * them in step with each other until we converge on the same
 +               * block.
                 */
 -              if (partial2 > chain2) {
 +              if (partial > chain && depth <= depth2) {
 +                      ext4_free_branches(handle, inode, partial->bh,
 +                                         partial->p + 1,
 +                                         (__le32 *)partial->bh->b_data+addr_per_block,
 +                                         (chain+n-1) - partial);
 +                      BUFFER_TRACE(partial->bh, "call brelse");
 +                      brelse(partial->bh);
 +                      partial--;
 +              }
 +              if (partial2 > chain2 && depth2 <= depth) {
                        ext4_free_branches(handle, inode, partial2->bh,
                                           (__le32 *)partial2->bh->b_data,
                                           partial2->p,
 -                                         (chain2+n-1) - partial2);
 +                                         (chain2+n2-1) - partial2);
                        BUFFER_TRACE(partial2->bh, "call brelse");
                        brelse(partial2->bh);
                        partial2--;
                }
        }
 +      return 0;
  
  do_indirects:
        /* Kill the remaining (whole) subtrees */
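
The new "Free top" logic above only releases the shared top branch when the end-of-range chain does not descend through the same indirect blocks as the start-of-range chain. A stand-alone rendering of that subtree test (branches_share_subtree() is a hypothetical helper; the patch open-codes the loop over the offsets arrays):

/*
 * Hypothetical helper; the patch open-codes this comparison.  "unsigned int"
 * stands in for the ext4_lblk_t offsets filled in by ext4_block_to_path().
 */
static int branches_share_subtree(const unsigned int *offsets,
				  const unsigned int *offsets2, int level)
{
	int i;

	for (i = 0; i <= level; i++)
		if (offsets[i] != offsets2[i])
			return 0;	/* paths diverge at or above @level */
	return 1;			/* same indirect blocks down to @level */
}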
diff --combined fs/ext4/inode.c
@@@ -37,7 -37,6 +37,6 @@@
  #include <linux/printk.h>
  #include <linux/slab.h>
  #include <linux/ratelimit.h>
- #include <linux/aio.h>
  #include <linux/bitops.h>
  
  #include "ext4_jbd2.h"
@@@ -1024,7 -1023,6 +1023,7 @@@ static int ext4_write_end(struct file *
  {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
 +      loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int i_size_changed = 0;
  
        unlock_page(page);
        page_cache_release(page);
  
 +      if (old_size < pos)
 +              pagecache_isize_extended(inode, old_size, pos);
        /*
         * Don't mark the inode dirty under page lock. First, it unnecessarily
         * makes the holding time of page lock longer. Second, it forces lock
@@@ -1098,7 -1094,6 +1097,7 @@@ static int ext4_journalled_write_end(st
  {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
 +      loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
        unlock_page(page);
        page_cache_release(page);
  
 +      if (old_size < pos)
 +              pagecache_isize_extended(inode, old_size, pos);
 +
        if (size_changed) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
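
Both write_end hunks above capture the old i_size before the size update and call pagecache_isize_extended() once the new size is visible, so the page that used to contain EOF gets its tail zeroed for mmap readers. A hedged sketch of that ordering, using generic_write_end() as a stand-in for ext4's own completion path (my_write_end() is illustrative):

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* Illustrative ->write_end showing where pagecache_isize_extended() goes. */
static int my_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	int ret;

	/* generic_write_end() updates i_size and drops the page lock. */
	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);

	/* Only once the new size is visible: fix up the old EOF page. */
	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
	return ret;
}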
diff --combined fs/fuse/dev.c
@@@ -19,7 -19,6 +19,6 @@@
  #include <linux/pipe_fs_i.h>
  #include <linux/swap.h>
  #include <linux/splice.h>
- #include <linux/aio.h>
  
  MODULE_ALIAS_MISCDEV(FUSE_MINOR);
  MODULE_ALIAS("devname:fuse");
@@@ -890,8 -889,8 +889,8 @@@ static int fuse_try_move_page(struct fu
  
        newpage = buf->page;
  
 -      if (WARN_ON(!PageUptodate(newpage)))
 -              return -EIO;
 +      if (!PageUptodate(newpage))
 +              SetPageUptodate(newpage);
  
        ClearPageMappedToDisk(newpage);
  
@@@ -1353,17 -1352,6 +1352,17 @@@ static ssize_t fuse_dev_do_read(struct 
        return err;
  }
  
 +static int fuse_dev_open(struct inode *inode, struct file *file)
 +{
 +      /*
 +       * The private_data field of the fuse device file is used to hold
 +       * the fuse_conn(ection) once it is mounted, and to keep track of
 +       * whether the file has been mounted already.
 +       */
 +      file->private_data = NULL;
 +      return 0;
 +}
 +
  static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
                              unsigned long nr_segs, loff_t pos)
  {
@@@ -1808,9 -1796,6 +1807,9 @@@ copy_finish
  static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                       unsigned int size, struct fuse_copy_state *cs)
  {
 +      /* Don't try to move pages (yet) */
 +      cs->move_pages = 0;
 +
        switch (code) {
        case FUSE_NOTIFY_POLL:
                return fuse_notify_poll(fc, size, cs);
@@@ -2231,7 -2216,6 +2230,7 @@@ static int fuse_dev_fasync(int fd, stru
  
  const struct file_operations fuse_dev_operations = {
        .owner          = THIS_MODULE,
 +      .open           = fuse_dev_open,
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = fuse_dev_read,
diff --combined fs/nfs/direct.c
@@@ -265,7 -265,7 +265,7 @@@ ssize_t nfs_direct_IO(int rw, struct ki
  
        return -EINVAL;
  #else
-       VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+       VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
  
        if (rw == READ)
                return nfs_file_direct_read(iocb, iter, pos);
@@@ -283,7 -283,7 +283,7 @@@ static void nfs_direct_release_pages(st
  void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
                              struct nfs_direct_req *dreq)
  {
 -      cinfo->lock = &dreq->lock;
 +      cinfo->lock = &dreq->inode->i_lock;
        cinfo->mds = &dreq->mds_cinfo;
        cinfo->ds = &dreq->ds_cinfo;
        cinfo->dreq = dreq;
@@@ -393,7 -393,7 +393,7 @@@ static void nfs_direct_complete(struct 
                long res = (long) dreq->error;
                if (!res)
                        res = (long) dreq->count;
-               aio_complete(dreq->iocb, res, 0);
+               dreq->iocb->ki_complete(dreq->iocb, res, 0);
        }
  
        complete_all(&dreq->completion);
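
The completion hunk above switches from aio_complete() to the kiocb's ->ki_complete() callback; with that convention a synchronous kiocb has no callback and the submitter just consumes the return value. A small, hedged sketch of the resulting dispatch (finish_request() is a hypothetical helper; nfs_direct_complete() instead checks dreq->iocb):

#include <linux/fs.h>

/* Hypothetical helper: complete an async kiocb, or do nothing for sync I/O. */
static void finish_request(struct kiocb *iocb, long res)
{
	if (iocb->ki_complete)
		iocb->ki_complete(iocb, res, 0);
	/* For a synchronous kiocb the caller just uses the returned count. */
}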
diff --combined fs/nfs/file.c
@@@ -26,7 -26,6 +26,6 @@@
  #include <linux/nfs_mount.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
- #include <linux/aio.h>
  #include <linux/gfp.h>
  #include <linux/swap.h>
  
@@@ -178,7 -177,7 +177,7 @@@ nfs_file_read(struct kiocb *iocb, struc
                iocb->ki_filp,
                iov_iter_count(to), (unsigned long) iocb->ki_pos);
  
 -      result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 +      result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
        if (!result) {
                result = generic_file_read_iter(iocb, to);
                if (result > 0)
@@@ -199,7 -198,7 +198,7 @@@ nfs_file_splice_read(struct file *filp
        dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
                filp, (unsigned long) count, (unsigned long long) *ppos);
  
 -      res = nfs_revalidate_mapping(inode, filp->f_mapping);
 +      res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
        if (!res) {
                res = generic_file_splice_read(filp, ppos, pipe, count, flags);
                if (res > 0)
@@@ -372,10 -371,6 +371,10 @@@ start
                                 nfs_wait_bit_killable, TASK_KILLABLE);
        if (ret)
                return ret;
 +      /*
 +       * Wait for O_DIRECT to complete
 +       */
 +      nfs_inode_dio_wait(mapping->host);
  
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
@@@ -623,9 -618,6 +622,9 @@@ static int nfs_vm_page_mkwrite(struct v
        /* make sure the cache has finished storing the page */
        nfs_fscache_wait_on_page_write(NFS_I(inode), page);
  
 +      wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
 +                      nfs_wait_bit_killable, TASK_KILLABLE);
 +
        lock_page(page);
        mapping = page_file_mapping(page);
        if (mapping != inode->i_mapping)
diff --combined fs/ntfs/file.c
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
   *
 - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
 + * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
   *
   * This program/include file is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License as published
@@@ -28,7 -28,6 +28,6 @@@
  #include <linux/swap.h>
  #include <linux/uio.h>
  #include <linux/writeback.h>
- #include <linux/aio.h>
  
  #include <asm/page.h>
  #include <asm/uaccess.h>
@@@ -329,168 -328,62 +328,168 @@@ err_out
        return err;
  }
  
 -/**
 - * ntfs_fault_in_pages_readable -
 - *
 - * Fault a number of userspace pages into pagetables.
 - *
 - * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
 - * with more than two userspace pages as well as handling the single page case
 - * elegantly.
 - *
 - * If you find this difficult to understand, then think of the while loop being
 - * the following code, except that we do without the integer variable ret:
 - *
 - *    do {
 - *            ret = __get_user(c, uaddr);
 - *            uaddr += PAGE_SIZE;
 - *    } while (!ret && uaddr < end);
 - *
 - * Note, the final __get_user() may well run out-of-bounds of the user buffer,
 - * but _not_ out-of-bounds of the page the user buffer belongs to, and since
 - * this is only a read and not a write, and since it is still in the same page,
 - * it should not matter and this makes the code much simpler.
 - */
 -static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
 -              int bytes)
 +static ssize_t ntfs_prepare_file_for_write(struct file *file, loff_t *ppos,
 +              size_t *count)
  {
 -      const char __user *end;
 -      volatile char c;
 -
 -      /* Set @end to the first byte outside the last page we care about. */
 -      end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
 -
 -      while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
 -              ;
 -}
 -
 -/**
 - * ntfs_fault_in_pages_readable_iovec -
 - *
 - * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
 - */
 -static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
 -              size_t iov_ofs, int bytes)
 -{
 -      do {
 -              const char __user *buf;
 -              unsigned len;
 +      loff_t pos;
 +      s64 end, ll;
 +      ssize_t err;
 +      unsigned long flags;
 +      struct inode *vi = file_inode(file);
 +      ntfs_inode *base_ni, *ni = NTFS_I(vi);
 +      ntfs_volume *vol = ni->vol;
  
 -              buf = iov->iov_base + iov_ofs;
 -              len = iov->iov_len - iov_ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              ntfs_fault_in_pages_readable(buf, len);
 -              bytes -= len;
 -              iov++;
 -              iov_ofs = 0;
 -      } while (bytes);
 +      ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
 +                      "0x%llx, count 0x%lx.", vi->i_ino,
 +                      (unsigned)le32_to_cpu(ni->type),
 +                      (unsigned long long)*ppos, (unsigned long)*count);
 +      /* We can write back this queue in page reclaim. */
 +      current->backing_dev_info = inode_to_bdi(vi);
 +      err = generic_write_checks(file, ppos, count, S_ISBLK(vi->i_mode));
 +      if (unlikely(err))
 +              goto out;
 +      /*
 +       * All checks have passed.  Before we start doing any writing we want
 +       * to abort any totally illegal writes.
 +       */
 +      BUG_ON(NInoMstProtected(ni));
 +      BUG_ON(ni->type != AT_DATA);
 +      /* If file is encrypted, deny access, just like NT4. */
 +      if (NInoEncrypted(ni)) {
 +              /* Only $DATA attributes can be encrypted. */
 +              /*
 +               * Reminder for later: Encrypted files are _always_
 +               * non-resident so that the content can always be encrypted.
 +               */
 +              ntfs_debug("Denying write access to encrypted file.");
 +              err = -EACCES;
 +              goto out;
 +      }
 +      if (NInoCompressed(ni)) {
 +              /* Only unnamed $DATA attribute can be compressed. */
 +              BUG_ON(ni->name_len);
 +              /*
 +               * Reminder for later: If resident, the data is not actually
 +               * compressed.  Only on the switch to non-resident does
 +               * compression kick in.  This is in contrast to encrypted files
 +               * (see above).
 +               */
 +              ntfs_error(vi->i_sb, "Writing to compressed files is not "
 +                              "implemented yet.  Sorry.");
 +              err = -EOPNOTSUPP;
 +              goto out;
 +      }
 +      if (*count == 0)
 +              goto out;
 +      base_ni = ni;
 +      if (NInoAttr(ni))
 +              base_ni = ni->ext.base_ntfs_ino;
 +      err = file_remove_suid(file);
 +      if (unlikely(err))
 +              goto out;
 +      /*
 +       * Our ->update_time method always succeeds thus file_update_time()
 +       * cannot fail either so there is no need to check the return code.
 +       */
 +      file_update_time(file);
 +      pos = *ppos;
 +      /* The first byte after the last cluster being written to. */
 +      end = (pos + *count + vol->cluster_size_mask) &
 +                      ~(u64)vol->cluster_size_mask;
 +      /*
 +       * If the write goes beyond the allocated size, extend the allocation
 +       * to cover the whole of the write, rounded up to the nearest cluster.
 +       */
 +      read_lock_irqsave(&ni->size_lock, flags);
 +      ll = ni->allocated_size;
 +      read_unlock_irqrestore(&ni->size_lock, flags);
 +      if (end > ll) {
 +              /*
 +               * Extend the allocation without changing the data size.
 +               *
 +               * Note we ensure the allocation is big enough to at least
 +               * write some data but we do not require the allocation to be
 +               * complete, i.e. it may be partial.
 +               */
 +              ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
 +              if (likely(ll >= 0)) {
 +                      BUG_ON(pos >= ll);
 +                      /* If the extension was partial truncate the write. */
 +                      if (end > ll) {
 +                              ntfs_debug("Truncating write to inode 0x%lx, "
 +                                              "attribute type 0x%x, because "
 +                                              "the allocation was only "
 +                                              "partially extended.",
 +                                              vi->i_ino, (unsigned)
 +                                              le32_to_cpu(ni->type));
 +                              *count = ll - pos;
 +                      }
 +              } else {
 +                      err = ll;
 +                      read_lock_irqsave(&ni->size_lock, flags);
 +                      ll = ni->allocated_size;
 +                      read_unlock_irqrestore(&ni->size_lock, flags);
 +                      /* Perform a partial write if possible or fail. */
 +                      if (pos < ll) {
 +                              ntfs_debug("Truncating write to inode 0x%lx "
 +                                              "attribute type 0x%x, because "
 +                                              "extending the allocation "
 +                                              "failed (error %d).",
 +                                              vi->i_ino, (unsigned)
 +                                              le32_to_cpu(ni->type),
 +                                              (int)-err);
 +                              *count = ll - pos;
 +                      } else {
 +                              if (err != -ENOSPC)
 +                                      ntfs_error(vi->i_sb, "Cannot perform "
 +                                                      "write to inode "
 +                                                      "0x%lx, attribute "
 +                                                      "type 0x%x, because "
 +                                                      "extending the "
 +                                                      "allocation failed "
 +                                                      "(error %ld).",
 +                                                      vi->i_ino, (unsigned)
 +                                                      le32_to_cpu(ni->type),
 +                                                      (long)-err);
 +                              else
 +                                      ntfs_debug("Cannot perform write to "
 +                                                      "inode 0x%lx, "
 +                                                      "attribute type 0x%x, "
 +                                                      "because there is no "
 +                                                      "space left.",
 +                                                      vi->i_ino, (unsigned)
 +                                                      le32_to_cpu(ni->type));
 +                              goto out;
 +                      }
 +              }
 +      }
 +      /*
 +       * If the write starts beyond the initialized size, extend it up to the
 +       * beginning of the write and initialize all non-sparse space between
 +       * the old initialized size and the new one.  This automatically also
 +       * increments the vfs inode->i_size to keep it above or equal to the
 +       * initialized_size.
 +       */
 +      read_lock_irqsave(&ni->size_lock, flags);
 +      ll = ni->initialized_size;
 +      read_unlock_irqrestore(&ni->size_lock, flags);
 +      if (pos > ll) {
 +              /*
 +               * Wait for ongoing direct i/o to complete before proceeding.
 +               * New direct i/o cannot start as we hold i_mutex.
 +               */
 +              inode_dio_wait(vi);
 +              err = ntfs_attr_extend_initialized(ni, pos);
 +              if (unlikely(err < 0))
 +                      ntfs_error(vi->i_sb, "Cannot perform write to inode "
 +                                      "0x%lx, attribute type 0x%x, because "
 +                                      "extending the initialized size "
 +                                      "failed (error %d).", vi->i_ino,
 +                                      (unsigned)le32_to_cpu(ni->type),
 +                                      (int)-err);
 +      }
 +out:
 +      return err;
  }
  
  /**
@@@ -527,8 -420,8 +526,8 @@@ static inline int __ntfs_grab_cache_pag
                                        goto err_out;
                                }
                        }
 -                      err = add_to_page_cache_lru(*cached_page, mapping, index,
 -                                      GFP_KERNEL);
 +                      err = add_to_page_cache_lru(*cached_page, mapping,
 +                                      index, GFP_KERNEL);
                        if (unlikely(err)) {
                                if (err == -EEXIST)
                                        continue;
@@@ -1374,6 -1267,180 +1373,6 @@@ rl_not_mapped_enoent
        return err;
  }
  
 -/*
 - * Copy as much as we can into the pages and return the number of bytes which
 - * were successfully copied.  If a fault is encountered then clear the pages
 - * out to (ofs + bytes) and return the number of bytes which were copied.
 - */
 -static inline size_t ntfs_copy_from_user(struct page **pages,
 -              unsigned nr_pages, unsigned ofs, const char __user *buf,
 -              size_t bytes)
 -{
 -      struct page **last_page = pages + nr_pages;
 -      char *addr;
 -      size_t total = 0;
 -      unsigned len;
 -      int left;
 -
 -      do {
 -              len = PAGE_CACHE_SIZE - ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              addr = kmap_atomic(*pages);
 -              left = __copy_from_user_inatomic(addr + ofs, buf, len);
 -              kunmap_atomic(addr);
 -              if (unlikely(left)) {
 -                      /* Do it the slow way. */
 -                      addr = kmap(*pages);
 -                      left = __copy_from_user(addr + ofs, buf, len);
 -                      kunmap(*pages);
 -                      if (unlikely(left))
 -                              goto err_out;
 -              }
 -              total += len;
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              buf += len;
 -              ofs = 0;
 -      } while (++pages < last_page);
 -out:
 -      return total;
 -err_out:
 -      total += len - left;
 -      /* Zero the rest of the target like __copy_from_user(). */
 -      while (++pages < last_page) {
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              len = PAGE_CACHE_SIZE;
 -              if (len > bytes)
 -                      len = bytes;
 -              zero_user(*pages, 0, len);
 -      }
 -      goto out;
 -}
 -
 -static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
 -              const struct iovec *iov, size_t iov_ofs, size_t bytes)
 -{
 -      size_t total = 0;
 -
 -      while (1) {
 -              const char __user *buf = iov->iov_base + iov_ofs;
 -              unsigned len;
 -              size_t left;
 -
 -              len = iov->iov_len - iov_ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              left = __copy_from_user_inatomic(vaddr, buf, len);
 -              total += len;
 -              bytes -= len;
 -              vaddr += len;
 -              if (unlikely(left)) {
 -                      total -= left;
 -                      break;
 -              }
 -              if (!bytes)
 -                      break;
 -              iov++;
 -              iov_ofs = 0;
 -      }
 -      return total;
 -}
 -
 -static inline void ntfs_set_next_iovec(const struct iovec **iovp,
 -              size_t *iov_ofsp, size_t bytes)
 -{
 -      const struct iovec *iov = *iovp;
 -      size_t iov_ofs = *iov_ofsp;
 -
 -      while (bytes) {
 -              unsigned len;
 -
 -              len = iov->iov_len - iov_ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              bytes -= len;
 -              iov_ofs += len;
 -              if (iov->iov_len == iov_ofs) {
 -                      iov++;
 -                      iov_ofs = 0;
 -              }
 -      }
 -      *iovp = iov;
 -      *iov_ofsp = iov_ofs;
 -}
 -
 -/*
 - * This has the same side-effects and return value as ntfs_copy_from_user().
 - * The difference is that on a fault we need to memset the remainder of the
 - * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
 - * single-segment behaviour.
 - *
 - * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
 - * atomic and when not atomic.  This is ok because it calls
 - * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
 - * fact, the only difference between __copy_from_user_inatomic() and
 - * __copy_from_user() is that the latter calls might_sleep() and the former
 - * should not zero the tail of the buffer on error.  And on many architectures
 - * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
 - * makes no difference at all on those architectures.
 - */
 -static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 -              unsigned nr_pages, unsigned ofs, const struct iovec **iov,
 -              size_t *iov_ofs, size_t bytes)
 -{
 -      struct page **last_page = pages + nr_pages;
 -      char *addr;
 -      size_t copied, len, total = 0;
 -
 -      do {
 -              len = PAGE_CACHE_SIZE - ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              addr = kmap_atomic(*pages);
 -              copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
 -                              *iov, *iov_ofs, len);
 -              kunmap_atomic(addr);
 -              if (unlikely(copied != len)) {
 -                      /* Do it the slow way. */
 -                      addr = kmap(*pages);
 -                      copied = __ntfs_copy_from_user_iovec_inatomic(addr +
 -                                      ofs, *iov, *iov_ofs, len);
 -                      if (unlikely(copied != len))
 -                              goto err_out;
 -                      kunmap(*pages);
 -              }
 -              total += len;
 -              ntfs_set_next_iovec(iov, iov_ofs, len);
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              ofs = 0;
 -      } while (++pages < last_page);
 -out:
 -      return total;
 -err_out:
 -      BUG_ON(copied > len);
 -      /* Zero the rest of the target like __copy_from_user(). */
 -      memset(addr + ofs + copied, 0, len - copied);
 -      kunmap(*pages);
 -      total += copied;
 -      ntfs_set_next_iovec(iov, iov_ofs, copied);
 -      while (++pages < last_page) {
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              len = PAGE_CACHE_SIZE;
 -              if (len > bytes)
 -                      len = bytes;
 -              zero_user(*pages, 0, len);
 -      }
 -      goto out;
 -}
 -
  static inline void ntfs_flush_dcache_pages(struct page **pages,
                unsigned nr_pages)
  {
@@@ -1694,83 -1761,86 +1693,83 @@@ err_out
        return err;
  }
  
 -static void ntfs_write_failed(struct address_space *mapping, loff_t to)
 +/*
 + * Copy as much as we can into the pages and return the number of bytes which
 + * were successfully copied.  If a fault is encountered then clear the pages
 + * out to (ofs + bytes) and return the number of bytes which were copied.
 + */
 +static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
 +              unsigned ofs, struct iov_iter *i, size_t bytes)
  {
 -      struct inode *inode = mapping->host;
 +      struct page **last_page = pages + nr_pages;
 +      size_t total = 0;
 +      struct iov_iter data = *i;
 +      unsigned len, copied;
  
 -      if (to > inode->i_size) {
 -              truncate_pagecache(inode, inode->i_size);
 -              ntfs_truncate_vfs(inode);
 -      }
 +      do {
 +              len = PAGE_CACHE_SIZE - ofs;
 +              if (len > bytes)
 +                      len = bytes;
 +              copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
 +                              len);
 +              total += copied;
 +              bytes -= copied;
 +              if (!bytes)
 +                      break;
 +              iov_iter_advance(&data, copied);
 +              if (copied < len)
 +                      goto err;
 +              ofs = 0;
 +      } while (++pages < last_page);
 +out:
 +      return total;
 +err:
 +      /* Zero the rest of the target like __copy_from_user(). */
 +      len = PAGE_CACHE_SIZE - copied;
 +      do {
 +              if (len > bytes)
 +                      len = bytes;
 +              zero_user(*pages, copied, len);
 +              bytes -= len;
 +              copied = 0;
 +              len = PAGE_CACHE_SIZE;
 +      } while (++pages < last_page);
 +      goto out;
  }
  
  /**
 - * ntfs_file_buffered_write -
 - *
 - * Locking: The vfs is holding ->i_mutex on the inode.
 + * ntfs_perform_write - perform buffered write to a file
 + * @file:     file to write to
 + * @i:                iov_iter with data to write
 + * @pos:      byte offset in the file at which to begin writing
   */
 -static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 -              const struct iovec *iov, unsigned long nr_segs,
 -              loff_t pos, loff_t *ppos, size_t count)
 +static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
 +              loff_t pos)
  {
 -      struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *vi = mapping->host;
        ntfs_inode *ni = NTFS_I(vi);
        ntfs_volume *vol = ni->vol;
        struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
        struct page *cached_page = NULL;
 -      char __user *buf = NULL;
 -      s64 end, ll;
        VCN last_vcn;
        LCN lcn;
 -      unsigned long flags;
 -      size_t bytes, iov_ofs = 0;      /* Offset in the current iovec. */
 -      ssize_t status, written;
 +      size_t bytes;
 +      ssize_t status, written = 0;
        unsigned nr_pages;
 -      int err;
  
 -      ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
 -                      "pos 0x%llx, count 0x%lx.",
 -                      vi->i_ino, (unsigned)le32_to_cpu(ni->type),
 -                      (unsigned long long)pos, (unsigned long)count);
 -      if (unlikely(!count))
 -              return 0;
 -      BUG_ON(NInoMstProtected(ni));
 -      /*
 -       * If the attribute is not an index root and it is encrypted or
 -       * compressed, we cannot write to it yet.  Note we need to check for
 -       * AT_INDEX_ALLOCATION since this is the type of both directory and
 -       * index inodes.
 -       */
 -      if (ni->type != AT_INDEX_ALLOCATION) {
 -              /* If file is encrypted, deny access, just like NT4. */
 -              if (NInoEncrypted(ni)) {
 -                      /*
 -                       * Reminder for later: Encrypted files are _always_
 -                       * non-resident so that the content can always be
 -                       * encrypted.
 -                       */
 -                      ntfs_debug("Denying write access to encrypted file.");
 -                      return -EACCES;
 -              }
 -              if (NInoCompressed(ni)) {
 -                      /* Only unnamed $DATA attribute can be compressed. */
 -                      BUG_ON(ni->type != AT_DATA);
 -                      BUG_ON(ni->name_len);
 -                      /*
 -                       * Reminder for later: If resident, the data is not
 -                       * actually compressed.  Only on the switch to non-
 -                       * resident does compression kick in.  This is in
 -                       * contrast to encrypted files (see above).
 -                       */
 -                      ntfs_error(vi->i_sb, "Writing to compressed files is "
 -                                      "not implemented yet.  Sorry.");
 -                      return -EOPNOTSUPP;
 -              }
 -      }
 +      ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
 +                      "0x%llx, count 0x%lx.", vi->i_ino,
 +                      (unsigned)le32_to_cpu(ni->type),
 +                      (unsigned long long)pos,
 +                      (unsigned long)iov_iter_count(i));
        /*
         * If a previous ntfs_truncate() failed, repeat it and abort if it
         * fails again.
         */
        if (unlikely(NInoTruncateFailed(ni))) {
 +              int err;
 +
                inode_dio_wait(vi);
                err = ntfs_truncate(vi);
                if (err || NInoTruncateFailed(ni)) {
                        return err;
                }
        }
 -      /* The first byte after the write. */
 -      end = pos + count;
 -      /*
 -       * If the write goes beyond the allocated size, extend the allocation
 -       * to cover the whole of the write, rounded up to the nearest cluster.
 -       */
 -      read_lock_irqsave(&ni->size_lock, flags);
 -      ll = ni->allocated_size;
 -      read_unlock_irqrestore(&ni->size_lock, flags);
 -      if (end > ll) {
 -              /* Extend the allocation without changing the data size. */
 -              ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
 -              if (likely(ll >= 0)) {
 -                      BUG_ON(pos >= ll);
 -                      /* If the extension was partial truncate the write. */
 -                      if (end > ll) {
 -                              ntfs_debug("Truncating write to inode 0x%lx, "
 -                                              "attribute type 0x%x, because "
 -                                              "the allocation was only "
 -                                              "partially extended.",
 -                                              vi->i_ino, (unsigned)
 -                                              le32_to_cpu(ni->type));
 -                              end = ll;
 -                              count = ll - pos;
 -                      }
 -              } else {
 -                      err = ll;
 -                      read_lock_irqsave(&ni->size_lock, flags);
 -                      ll = ni->allocated_size;
 -                      read_unlock_irqrestore(&ni->size_lock, flags);
 -                      /* Perform a partial write if possible or fail. */
 -                      if (pos < ll) {
 -                              ntfs_debug("Truncating write to inode 0x%lx, "
 -                                              "attribute type 0x%x, because "
 -                                              "extending the allocation "
 -                                              "failed (error code %i).",
 -                                              vi->i_ino, (unsigned)
 -                                              le32_to_cpu(ni->type), err);
 -                              end = ll;
 -                              count = ll - pos;
 -                      } else {
 -                              ntfs_error(vol->sb, "Cannot perform write to "
 -                                              "inode 0x%lx, attribute type "
 -                                              "0x%x, because extending the "
 -                                              "allocation failed (error "
 -                                              "code %i).", vi->i_ino,
 -                                              (unsigned)
 -                                              le32_to_cpu(ni->type), err);
 -                              return err;
 -                      }
 -              }
 -      }
 -      written = 0;
 -      /*
 -       * If the write starts beyond the initialized size, extend it up to the
 -       * beginning of the write and initialize all non-sparse space between
 -       * the old initialized size and the new one.  This automatically also
 -       * increments the vfs inode->i_size to keep it above or equal to the
 -       * initialized_size.
 -       */
 -      read_lock_irqsave(&ni->size_lock, flags);
 -      ll = ni->initialized_size;
 -      read_unlock_irqrestore(&ni->size_lock, flags);
 -      if (pos > ll) {
 -              err = ntfs_attr_extend_initialized(ni, pos);
 -              if (err < 0) {
 -                      ntfs_error(vol->sb, "Cannot perform write to inode "
 -                                      "0x%lx, attribute type 0x%x, because "
 -                                      "extending the initialized size "
 -                                      "failed (error code %i).", vi->i_ino,
 -                                      (unsigned)le32_to_cpu(ni->type), err);
 -                      status = err;
 -                      goto err_out;
 -              }
 -      }
        /*
         * Determine the number of pages per cluster for non-resident
         * attributes.
        nr_pages = 1;
        if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
                nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
 -      /* Finally, perform the actual write. */
        last_vcn = -1;
 -      if (likely(nr_segs == 1))
 -              buf = iov->iov_base;
        do {
                VCN vcn;
                pgoff_t idx, start_idx;
                                                vol->cluster_size_bits, false);
                                up_read(&ni->runlist.lock);
                                if (unlikely(lcn < LCN_HOLE)) {
 -                                      status = -EIO;
                                        if (lcn == LCN_ENOMEM)
                                                status = -ENOMEM;
 -                                      else
 +                                      else {
 +                                              status = -EIO;
                                                ntfs_error(vol->sb, "Cannot "
                                                        "perform write to "
                                                        "inode 0x%lx, "
                                                        "is corrupt.",
                                                        vi->i_ino, (unsigned)
                                                        le32_to_cpu(ni->type));
 +                                      }
                                        break;
                                }
                                if (lcn == LCN_HOLE) {
                                }
                        }
                }
 -              if (bytes > count)
 -                      bytes = count;
 +              if (bytes > iov_iter_count(i))
 +                      bytes = iov_iter_count(i);
 +again:
                /*
                 * Bring in the user page(s) that we will copy from _first_.
                 * Otherwise there is a nasty deadlock on copying from the same
                 * pages being swapped out between us bringing them into memory
                 * and doing the actual copying.
                 */
 -              if (likely(nr_segs == 1))
 -                      ntfs_fault_in_pages_readable(buf, bytes);
 -              else
 -                      ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
 +              if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
 +                      status = -EFAULT;
 +                      break;
 +              }
                /* Get and lock @do_pages starting at index @start_idx. */
                status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
                                pages, &cached_page);
                        status = ntfs_prepare_pages_for_non_resident_write(
                                        pages, do_pages, pos, bytes);
                        if (unlikely(status)) {
 -                              loff_t i_size;
 -
                                do {
                                        unlock_page(pages[--do_pages]);
                                        page_cache_release(pages[do_pages]);
                                } while (do_pages);
 -                              /*
 -                               * The write preparation may have instantiated
 -                               * allocated space outside i_size.  Trim this
 -                               * off again.  We can ignore any errors in this
 -                               * case as we will just be waisting a bit of
 -                               * allocated space, which is not a disaster.
 -                               */
 -                              i_size = i_size_read(vi);
 -                              if (pos + bytes > i_size) {
 -                                      ntfs_write_failed(mapping, pos + bytes);
 -                              }
                                break;
                        }
                }
                u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
 -              if (likely(nr_segs == 1)) {
 -                      copied = ntfs_copy_from_user(pages + u, do_pages - u,
 -                                      ofs, buf, bytes);
 -                      buf += copied;
 -              } else
 -                      copied = ntfs_copy_from_user_iovec(pages + u,
 -                                      do_pages - u, ofs, &iov, &iov_ofs,
 -                                      bytes);
 +              copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
 +                                      i, bytes);
                ntfs_flush_dcache_pages(pages + u, do_pages - u);
 -              status = ntfs_commit_pages_after_write(pages, do_pages, pos,
 -                              bytes);
 -              if (likely(!status)) {
 -                      written += copied;
 -                      count -= copied;
 -                      pos += copied;
 -                      if (unlikely(copied != bytes))
 -                              status = -EFAULT;
 +              status = 0;
 +              if (likely(copied == bytes)) {
 +                      status = ntfs_commit_pages_after_write(pages, do_pages,
 +                                      pos, bytes);
 +                      if (!status)
 +                              status = bytes;
                }
                do {
                        unlock_page(pages[--do_pages]);
                        page_cache_release(pages[do_pages]);
                } while (do_pages);
 -              if (unlikely(status))
 +              if (unlikely(status < 0))
                        break;
 -              balance_dirty_pages_ratelimited(mapping);
 +              copied = status;
                cond_resched();
 -      } while (count);
 -err_out:
 -      *ppos = pos;
 +              if (unlikely(!copied)) {
 +                      size_t sc;
 +
 +                      /*
 +                       * We failed to copy anything.  Fall back to single
 +                       * segment length write.
 +                       *
 +                       * This is needed to avoid possible livelock in the
 +                       * case that all segments in the iov cannot be copied
 +                       * at once without a pagefault.
 +                       */
 +                      sc = iov_iter_single_seg_count(i);
 +                      if (bytes > sc)
 +                              bytes = sc;
 +                      goto again;
 +              }
 +              iov_iter_advance(i, copied);
 +              pos += copied;
 +              written += copied;
 +              balance_dirty_pages_ratelimited(mapping);
 +              if (fatal_signal_pending(current)) {
 +                      status = -EINTR;
 +                      break;
 +              }
 +      } while (iov_iter_count(i));
        if (cached_page)
                page_cache_release(cached_page);
        ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
  }
  
  /**
 - * ntfs_file_aio_write_nolock -
 + * ntfs_file_write_iter_nolock - write data to a file
 + * @iocb:     IO state structure (file, offset, etc.)
 + * @from:     iov_iter with data to write
 + *
 + * Basically the same as __generic_file_write_iter() except that it ends
 + * up calling ntfs_perform_write() instead of generic_perform_write() and that
 + * O_DIRECT is not implemented.
   */
 -static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 -              const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
 +static ssize_t ntfs_file_write_iter_nolock(struct kiocb *iocb,
 +              struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
 -      struct address_space *mapping = file->f_mapping;
 -      struct inode *inode = mapping->host;
 -      loff_t pos;
 -      size_t count;           /* after file limit checks */
 -      ssize_t written, err;
 +      loff_t pos = iocb->ki_pos;
 +      ssize_t written = 0;
 +      ssize_t err;
 +      size_t count = iov_iter_count(from);
  
 -      count = iov_length(iov, nr_segs);
 -      pos = *ppos;
 -      /* We can write back this queue in page reclaim. */
 -      current->backing_dev_info = inode_to_bdi(inode);
 -      written = 0;
 -      err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 -      if (err)
 -              goto out;
 -      if (!count)
 -              goto out;
 -      err = file_remove_suid(file);
 -      if (err)
 -              goto out;
 -      err = file_update_time(file);
 -      if (err)
 -              goto out;
 -      written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
 -                      count);
 -out:
 +      err = ntfs_prepare_file_for_write(file, &pos, &count);
 +      if (count && !err) {
 +              iov_iter_truncate(from, count);
 +              written = ntfs_perform_write(file, from, pos);
 +              if (likely(written >= 0))
 +                      iocb->ki_pos = pos + written;
 +      }
        current->backing_dev_info = NULL;
        return written ? written : err;
  }
  
  /**
 - * ntfs_file_aio_write -
 + * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
 + * @iocb:     IO state structure
 + * @from:     iov_iter with data to write
 + *
 + * Basically the same as generic_file_write_iter() except that it ends up
 + * calling ntfs_file_write_iter_nolock() instead of
 + * __generic_file_write_iter().
   */
 -static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 -              unsigned long nr_segs, loff_t pos)
 +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
 -      struct address_space *mapping = file->f_mapping;
 -      struct inode *inode = mapping->host;
 +      struct inode *vi = file_inode(file);
        ssize_t ret;
  
 -      BUG_ON(iocb->ki_pos != pos);
 -
 -      mutex_lock(&inode->i_mutex);
 -      ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 -      mutex_unlock(&inode->i_mutex);
 +      mutex_lock(&vi->i_mutex);
 +      ret = ntfs_file_write_iter_nolock(iocb, from);
 +      mutex_unlock(&vi->i_mutex);
        if (ret > 0) {
 -              int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 +              ssize_t err;
 +
 +              err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
@@@ -2048,17 -2196,37 +2047,17 @@@ static int ntfs_file_fsync(struct file 
  #endif /* NTFS_RW */
  
  const struct file_operations ntfs_file_ops = {
 -      .llseek         = generic_file_llseek,   /* Seek inside file. */
 -      .read           = new_sync_read,         /* Read from file. */
 -      .read_iter      = generic_file_read_iter, /* Async read from file. */
 +      .llseek         = generic_file_llseek,
 +      .read           = new_sync_read,
 +      .read_iter      = generic_file_read_iter,
  #ifdef NTFS_RW
 -      .write          = do_sync_write,         /* Write to file. */
 -      .aio_write      = ntfs_file_aio_write,   /* Async write to file. */
 -      /*.release      = ,*/                    /* Last file is closed.  See
 -                                                  fs/ext2/file.c::
 -                                                  ext2_release_file() for
 -                                                  how to use this to discard
 -                                                  preallocated space for
 -                                                  write opened files. */
 -      .fsync          = ntfs_file_fsync,       /* Sync a file to disk. */
 -      /*.aio_fsync    = ,*/                    /* Sync all outstanding async
 -                                                  i/o operations on a
 -                                                  kiocb. */
 +      .write          = new_sync_write,
 +      .write_iter     = ntfs_file_write_iter,
 +      .fsync          = ntfs_file_fsync,
  #endif /* NTFS_RW */
 -      /*.ioctl        = ,*/                    /* Perform function on the
 -                                                  mounted filesystem. */
 -      .mmap           = generic_file_mmap,     /* Mmap file. */
 -      .open           = ntfs_file_open,        /* Open file. */
 -      .splice_read    = generic_file_splice_read /* Zero-copy data send with
 -                                                  the data source being on
 -                                                  the ntfs partition.  We do
 -                                                  not need to care about the
 -                                                  data destination. */
 -      /*.sendpage     = ,*/                    /* Zero-copy data send with
 -                                                  the data destination being
 -                                                  on the ntfs partition.  We
 -                                                  do not need to care about
 -                                                  the data source. */
 +      .mmap           = generic_file_mmap,
 +      .open           = ntfs_file_open,
 +      .splice_read    = generic_file_splice_read,
  };
  
  const struct inode_operations ntfs_file_inode_ops = {
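
For readers following the conversion: the ntfs hunks above are an instance of a general pattern, a ->write_iter method that serialises on i_mutex, delegates to a *_nolock helper, and syncs the written range on success, plus a file_operations table that routes the legacy read/write entry points through new_sync_read()/new_sync_write(). A minimal sketch of that pattern, using hypothetical example_* names rather than the real ntfs functions:

#include <linux/fs.h>
#include <linux/uio.h>

/* Hypothetical helper standing in for the filesystem's real nolock writer. */
static ssize_t example_write_iter_nolock(struct kiocb *iocb, struct iov_iter *from);

static ssize_t example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	ret = example_write_iter_nolock(iocb, from);
	mutex_unlock(&inode->i_mutex);
	if (ret > 0) {
		/* Honour O_SYNC/O_DSYNC for the range just written. */
		ssize_t err = generic_write_sync(file, iocb->ki_pos - ret, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}

static const struct file_operations example_file_ops = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,	/* sync read driven through ->read_iter */
	.read_iter	= generic_file_read_iter,
	.write		= new_sync_write,	/* sync write driven through ->write_iter */
	.write_iter	= example_file_write_iter,
	.mmap		= generic_file_mmap,
};
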
diff --combined fs/xfs/xfs_file.c
@@@ -36,9 -36,7 +36,8 @@@
  #include "xfs_trace.h"
  #include "xfs_log.h"
  #include "xfs_icache.h"
 +#include "xfs_pnfs.h"
  
- #include <linux/aio.h>
  #include <linux/dcache.h>
  #include <linux/falloc.h>
  #include <linux/pagevec.h>
@@@ -397,8 -395,7 +396,8 @@@ STATIC int                         /* error (positive) *
  xfs_zero_last_block(
        struct xfs_inode        *ip,
        xfs_fsize_t             offset,
 -      xfs_fsize_t             isize)
 +      xfs_fsize_t             isize,
 +      bool                    *did_zeroing)
  {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
        zero_len = mp->m_sb.sb_blocksize - zero_offset;
        if (isize + zero_len > offset)
                zero_len = offset - isize;
 +      *did_zeroing = true;
        return xfs_iozero(ip, isize, zero_len);
  }
  
@@@ -445,8 -441,7 +444,8 @@@ int                                        /* error (positive) *
  xfs_zero_eof(
        struct xfs_inode        *ip,
        xfs_off_t               offset,         /* starting I/O offset */
 -      xfs_fsize_t             isize)          /* current inode size */
 +      xfs_fsize_t             isize,          /* current inode size */
 +      bool                    *did_zeroing)
  {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           start_zero_fsb;
         * We only zero a part of that block so it is handled specially.
         */
        if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
 -              error = xfs_zero_last_block(ip, offset, isize);
 +              error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
                if (error)
                        return error;
        }
                if (error)
                        return error;
  
 +              *did_zeroing = true;
                start_zero_fsb = imap.br_startoff + imap.br_blockcount;
                ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
        }
@@@ -559,10 -553,6 +558,10 @@@ restart
        if (error)
                return error;
  
 +      error = xfs_break_layouts(inode, iolock);
 +      if (error)
 +              return error;
 +
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
         * having to redo all checks before.
         */
        if (*pos > i_size_read(inode)) {
 +              bool    zero = false;
 +
                if (*iolock == XFS_IOLOCK_SHARED) {
                        xfs_rw_iunlock(ip, *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, *iolock);
                        goto restart;
                }
 -              error = xfs_zero_eof(ip, *pos, i_size_read(inode));
 +              error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
                if (error)
                        return error;
        }
@@@ -833,7 -821,6 +832,7 @@@ xfs_file_fallocate
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        enum xfs_prealloc_flags flags = 0;
 +      uint                    iolock = XFS_IOLOCK_EXCL;
        loff_t                  new_size = 0;
  
        if (!S_ISREG(inode->i_mode))
                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
  
 -      xfs_ilock(ip, XFS_IOLOCK_EXCL);
 +      xfs_ilock(ip, iolock);
 +      error = xfs_break_layouts(inode, &iolock);
 +      if (error)
 +              goto out_unlock;
 +
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
        }
  
  out_unlock:
 -      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 +      xfs_iunlock(ip, iolock);
        return error;
  }
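
The new did_zeroing out-parameter lets callers of xfs_zero_eof() distinguish "nothing needed zeroing" from "blocks were dirtied". A hypothetical caller fragment illustrating why that matters; the flush call and the newsize/oldsize names are illustrative, not taken from this patch:

	bool	did_zeroing = false;
	int	error;

	error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
	if (error)
		return error;

	/* Only pay for writing back the zeroed range if something was zeroed. */
	if (did_zeroing)
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						     oldsize, newsize - 1);
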
  
diff --combined include/linux/fs.h
@@@ -314,6 -314,28 +314,28 @@@ struct page
  struct address_space;
  struct writeback_control;
  
+ #define IOCB_EVENTFD          (1 << 0)
+ struct kiocb {
+       struct file             *ki_filp;
+       loff_t                  ki_pos;
+       void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+       void                    *private;
+       int                     ki_flags;
+ };
+ static inline bool is_sync_kiocb(struct kiocb *kiocb)
+ {
+       return kiocb->ki_complete == NULL;
+ }
+ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
+ {
+       *kiocb = (struct kiocb) {
+               .ki_filp = filp,
+       };
+ }
  /*
   * "descriptor" for what we're up to with a read.
   * This allows us to use the same read code yet
@@@ -968,6 -990,9 +990,6 @@@ struct file_lock_context 
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;
 -      int                     flc_flock_cnt;
 -      int                     flc_posix_cnt;
 -      int                     flc_lease_cnt;
  };
  
  /* The following constant reflects the upper bound of the file/locking space */
@@@ -2144,7 -2169,7 +2166,7 @@@ struct filename 
        const __user char       *uptr;  /* original userland pointer */
        struct audit_names      *aname;
        int                     refcnt;
 -      bool                    separate; /* should "name" be freed? */
 +      const char              iname[];
  };
  
  extern long vfs_truncate(struct path *, loff_t);
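
The trimmed-down kiocb above carries only what the synchronous and aio paths both need; a NULL ki_complete is what marks a kiocb as synchronous. A from-memory sketch, in the spirit of new_sync_read(), of how a sync read wrapper drives ->read_iter with such a kiocb (not a verbatim copy of the kernel function):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t example_sync_read(struct file *filp, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);	/* ki_complete == NULL => synchronous */
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);	/* sync kiocbs now complete inline */
	*ppos = kiocb.ki_pos;
	return ret;
}
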
diff --combined kernel/printk/printk.c
@@@ -32,7 -32,6 +32,6 @@@
  #include <linux/security.h>
  #include <linux/bootmem.h>
  #include <linux/memblock.h>
- #include <linux/aio.h>
  #include <linux/syscalls.h>
  #include <linux/kexec.h>
  #include <linux/kdb.h>
@@@ -46,6 -45,7 +45,7 @@@
  #include <linux/irq_work.h>
  #include <linux/utsname.h>
  #include <linux/ctype.h>
+ #include <linux/uio.h>
  
  #include <asm/uaccess.h>
  
@@@ -521,7 -521,7 +521,7 @@@ static ssize_t devkmsg_write(struct kio
        int i;
        int level = default_message_loglevel;
        int facility = 1;       /* LOG_USER */
-       size_t len = iocb->ki_nbytes;
+       size_t len = iov_iter_count(from);
        ssize_t ret = len;
  
        if (len > LOG_LINE_MAX)
@@@ -1811,7 -1811,7 +1811,7 @@@ int vprintk_default(const char *fmt, va
  
  #ifdef CONFIG_KGDB_KDB
        if (unlikely(kdb_trap_printk)) {
 -              r = vkdb_printf(fmt, args);
 +              r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
                return r;
        }
  #endif
@@@ -2464,7 -2464,6 +2464,7 @@@ void register_console(struct console *n
        for (i = 0, c = console_cmdline;
             i < MAX_CMDLINECONSOLES && c->name[0];
             i++, c++) {
 +              BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
                if (strcmp(c->name, newcon->name) != 0)
                        continue;
                if (newcon->index >= 0 &&
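
devkmsg_write() now takes its length from the iov_iter because the slimmed kiocb no longer records a byte count. A minimal hypothetical ->write_iter handler showing the same copy-in pattern (all names here are made up, not part of this patch):

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uio.h>

static ssize_t example_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	size_t len = iov_iter_count(from);	/* was iocb->ki_nbytes */
	char *buf;

	if (len > PAGE_SIZE)
		return -EINVAL;
	buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	if (copy_from_iter(buf, len, from) != len) {
		kfree(buf);
		return -EFAULT;
	}
	buf[len] = '\0';
	/* ... parse and consume buf ... */
	kfree(buf);
	return len;
}
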
diff --combined mm/shmem.c
@@@ -31,7 -31,7 +31,7 @@@
  #include <linux/mm.h>
  #include <linux/export.h>
  #include <linux/swap.h>
- #include <linux/aio.h>
+ #include <linux/uio.h>
  
  static struct vfsmount *shm_mnt;
  
@@@ -1455,9 -1455,6 +1455,9 @@@ static struct inode *shmem_get_inode(st
  
  bool shmem_mapping(struct address_space *mapping)
  {
 +      if (!mapping->host)
 +              return false;
 +
        return mapping->host->i_sb->s_op == &shmem_ops;
  }
  
@@@ -2322,8 -2319,8 +2322,8 @@@ static int shmem_rmdir(struct inode *di
  
  static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
  {
 -      bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
 -      bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
 +      bool old_is_dir = d_is_dir(old_dentry);
 +      bool new_is_dir = d_is_dir(new_dentry);
  
        if (old_dir != new_dir && old_is_dir != new_is_dir) {
                if (old_is_dir) {
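
The shmem hunks swap S_ISDIR(dentry->d_inode->i_mode) for d_is_dir(), which takes the type from flags cached in the dentry instead of dereferencing d_inode. A small before/after fragment (handle_directory() is a made-up placeholder):

	/* before: needs a positive dentry and an inode dereference */
	if (S_ISDIR(dentry->d_inode->i_mode))
		handle_directory(dentry);

	/* after: type comes from the cached dentry flags */
	if (d_is_dir(dentry))
		handle_directory(dentry);
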
diff --combined net/socket.c
@@@ -633,8 -633,7 +633,7 @@@ static int do_sock_sendmsg(struct socke
        init_sync_kiocb(&iocb, NULL);
        ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) :
                      __sock_sendmsg(&iocb, sock, msg, size);
-       if (-EIOCBQUEUED == ret)
-               ret = wait_on_sync_kiocb(&iocb);
+       BUG_ON(ret == -EIOCBQUEUED);
        return ret;
  }
  
@@@ -766,8 -765,7 +765,7 @@@ int sock_recvmsg(struct socket *sock, s
  
        init_sync_kiocb(&iocb, NULL);
        ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
-       if (-EIOCBQUEUED == ret)
-               ret = wait_on_sync_kiocb(&iocb);
+       BUG_ON(ret == -EIOCBQUEUED);
        return ret;
  }
  EXPORT_SYMBOL(sock_recvmsg);
@@@ -780,8 -778,7 +778,7 @@@ static int sock_recvmsg_nosec(struct so
  
        init_sync_kiocb(&iocb, NULL);
        ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
-       if (-EIOCBQUEUED == ret)
-               ret = wait_on_sync_kiocb(&iocb);
+       BUG_ON(ret == -EIOCBQUEUED);
        return ret;
  }
  
@@@ -858,11 -855,11 +855,11 @@@ static ssize_t sock_read_iter(struct ki
        if (iocb->ki_pos != 0)
                return -ESPIPE;
  
-       if (iocb->ki_nbytes == 0)       /* Match SYS5 behaviour */
+       if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                return 0;
  
        res = __sock_recvmsg(iocb, sock, &msg,
-                            iocb->ki_nbytes, msg.msg_flags);
+                            iov_iter_count(to), msg.msg_flags);
        *to = msg.msg_iter;
        return res;
  }
@@@ -883,7 -880,7 +880,7 @@@ static ssize_t sock_write_iter(struct k
        if (sock->type == SOCK_SEQPACKET)
                msg.msg_flags |= MSG_EOR;
  
-       res = __sock_sendmsg(iocb, sock, &msg, iocb->ki_nbytes);
+       res = __sock_sendmsg(iocb, sock, &msg, iov_iter_count(from));
        *from = msg.msg_iter;
        return res;
  }
@@@ -1702,8 -1699,6 +1699,8 @@@ SYSCALL_DEFINE6(sendto, int, fd, void _
  
        if (len > INT_MAX)
                len = INT_MAX;
 +      if (unlikely(!access_ok(VERIFY_READ, buff, len)))
 +              return -EFAULT;
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
@@@ -1762,8 -1757,6 +1759,8 @@@ SYSCALL_DEFINE6(recvfrom, int, fd, voi
  
        if (size > INT_MAX)
                size = INT_MAX;
 +      if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size)))
 +              return -EFAULT;
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
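
With ki_nbytes gone, the socket paths size each transfer from the iov_iter itself, and synchronous kiocbs can no longer return -EIOCBQUEUED, hence the BUG_ON()s above. Roughly the shape sock_read_iter() takes after the conversion, reconstructed from the hunk with non-essential details elided:

static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
	struct msghdr msg = { .msg_iter = *to };
	ssize_t res;

	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

	if (iocb->ki_pos != 0)
		return -ESPIPE;

	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
		return 0;

	res = __sock_recvmsg(iocb, sock, &msg, iov_iter_count(to), msg.msg_flags);
	*to = msg.msg_iter;		/* report what the protocol consumed */
	return res;
}
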
diff --combined sound/core/pcm_native.c
@@@ -25,7 -25,6 +25,6 @@@
  #include <linux/slab.h>
  #include <linux/time.h>
  #include <linux/pm_qos.h>
- #include <linux/aio.h>
  #include <linux/io.h>
  #include <linux/dma-mapping.h>
  #include <sound/core.h>
@@@ -35,6 -34,7 +34,7 @@@
  #include <sound/pcm_params.h>
  #include <sound/timer.h>
  #include <sound/minors.h>
+ #include <linux/uio.h>
  
  /*
   *  Compatibility
@@@ -1552,8 -1552,6 +1552,8 @@@ static int snd_pcm_do_drain_init(struc
                        if (! snd_pcm_playback_empty(substream)) {
                                snd_pcm_do_start(substream, SNDRV_PCM_STATE_DRAINING);
                                snd_pcm_post_start(substream, SNDRV_PCM_STATE_DRAINING);
 +                      } else {
 +                              runtime->status->state = SNDRV_PCM_STATE_SETUP;
                        }
                        break;
                case SNDRV_PCM_STATE_RUNNING: