Merge branch 'iocb' into for-next
author     Al Viro <viro@zeniv.linux.org.uk>
           Sun, 12 Apr 2015 02:24:41 +0000 (22:24 -0400)
committer  Al Viro <viro@zeniv.linux.org.uk>
           Sun, 12 Apr 2015 02:24:41 +0000 (22:24 -0400)
20 files changed:
arch/s390/hypfs/inode.c
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/scsi/sg.c
fs/aio.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/ceph/file.c
fs/ecryptfs/file.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/fuse/dev.c
fs/nfs/direct.c
fs/nfs/file.c
fs/ntfs/file.c
fs/xfs/xfs_file.c
include/linux/fs.h
kernel/printk/printk.c
mm/shmem.c
net/socket.c
sound/core/pcm_native.c

diff --combined arch/s390/hypfs/inode.c
@@@ -21,7 -21,7 +21,7 @@@
  #include <linux/module.h>
  #include <linux/seq_file.h>
  #include <linux/mount.h>
- #include <linux/aio.h>
+ #include <linux/uio.h>
  #include <asm/ebcdic.h>
  #include "hypfs.h"
  
@@@ -74,7 -74,7 +74,7 @@@ static void hypfs_remove(struct dentry 
        parent = dentry->d_parent;
        mutex_lock(&parent->d_inode->i_mutex);
        if (hypfs_positive(dentry)) {
 -              if (S_ISDIR(dentry->d_inode->i_mode))
 +              if (d_is_dir(dentry))
                        simple_rmdir(parent->d_inode, dentry);
                else
                        simple_unlink(parent->d_inode, dentry);
@@@ -144,32 -144,36 +144,32 @@@ static int hypfs_open(struct inode *ino
        return nonseekable_open(inode, filp);
  }
  
 -static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
 -                            unsigned long nr_segs, loff_t offset)
 +static ssize_t hypfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
 -      char *data;
 -      ssize_t ret;
 -      struct file *filp = iocb->ki_filp;
 -      /* XXX: temporary */
 -      char __user *buf = iov[0].iov_base;
 -      size_t count = iov[0].iov_len;
 -
 -      if (nr_segs != 1)
 -              return -EINVAL;
 -
 -      data = filp->private_data;
 -      ret = simple_read_from_buffer(buf, count, &offset, data, strlen(data));
 -      if (ret <= 0)
 -              return ret;
 +      struct file *file = iocb->ki_filp;
 +      char *data = file->private_data;
 +      size_t available = strlen(data);
 +      loff_t pos = iocb->ki_pos;
 +      size_t count;
  
 -      iocb->ki_pos += ret;
 -      file_accessed(filp);
 -
 -      return ret;
 +      if (pos < 0)
 +              return -EINVAL;
 +      if (pos >= available || !iov_iter_count(to))
 +              return 0;
 +      count = copy_to_iter(data + pos, available - pos, to);
 +      if (!count)
 +              return -EFAULT;
 +      iocb->ki_pos = pos + count;
 +      file_accessed(file);
 +      return count;
  }
 -static ssize_t hypfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
 -                            unsigned long nr_segs, loff_t offset)
 +
 +static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        int rc;
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
        struct hypfs_sb_info *fs_info = sb->s_fs_info;
 -      size_t count = iov_length(iov, nr_segs);
 +      size_t count = iov_iter_count(from);
  
        /*
         * Currently we only allow one update per second for two reasons:
        }
        hypfs_update_update(sb);
        rc = count;
 +      iov_iter_advance(from, count);
  out:
        mutex_unlock(&fs_info->lock);
        return rc;
@@@ -437,10 -440,10 +437,10 @@@ struct dentry *hypfs_create_str(struct 
  static const struct file_operations hypfs_file_ops = {
        .open           = hypfs_open,
        .release        = hypfs_release,
 -      .read           = do_sync_read,
 -      .write          = do_sync_write,
 -      .aio_read       = hypfs_aio_read,
 -      .aio_write      = hypfs_aio_write,
 +      .read           = new_sync_read,
 +      .write          = new_sync_write,
 +      .read_iter      = hypfs_read_iter,
 +      .write_iter     = hypfs_write_iter,
        .llseek         = no_llseek,
  };
  
diff --combined drivers/infiniband/hw/qib/qib_file_ops.c
@@@ -39,7 -39,6 +39,6 @@@
  #include <linux/vmalloc.h>
  #include <linux/highmem.h>
  #include <linux/io.h>
- #include <linux/aio.h>
  #include <linux/jiffies.h>
  #include <asm/pgtable.h>
  #include <linux/delay.h>
@@@ -351,10 -350,9 +350,10 @@@ static int qib_tid_update(struct qib_ct
                 * unless perhaps the user has mpin'ed the pages
                 * themselves.
                 */
 -              qib_devinfo(dd->pcidev,
 -                       "Failed to lock addr %p, %u pages: "
 -                       "errno %d\n", (void *) vaddr, cnt, -ret);
 +              qib_devinfo(
 +                      dd->pcidev,
 +                      "Failed to lock addr %p, %u pages: errno %d\n",
 +                      (void *) vaddr, cnt, -ret);
                goto done;
        }
        for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
@@@ -438,7 -436,7 +437,7 @@@ cleanup
                        goto cleanup;
                }
                if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
 -                               tidmap, sizeof tidmap)) {
 +                               tidmap, sizeof(tidmap))) {
                        ret = -EFAULT;
                        goto cleanup;
                }
@@@ -485,7 -483,7 +484,7 @@@ static int qib_tid_free(struct qib_ctxt
        }
  
        if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
 -                         sizeof tidmap)) {
 +                         sizeof(tidmap))) {
                ret = -EFAULT;
                goto done;
        }
@@@ -952,8 -950,8 +951,8 @@@ static int mmap_kvaddr(struct vm_area_s
                /* rcvegrbufs are read-only on the slave */
                if (vma->vm_flags & VM_WRITE) {
                        qib_devinfo(dd->pcidev,
 -                               "Can't map eager buffers as "
 -                               "writable (flags=%lx)\n", vma->vm_flags);
 +                               "Can't map eager buffers as writable (flags=%lx)\n",
 +                               vma->vm_flags);
                        ret = -EPERM;
                        goto bail;
                }
@@@ -1186,7 -1184,6 +1185,7 @@@ static void assign_ctxt_affinity(struc
         */
        if (weight >= qib_cpulist_count) {
                int cpu;
 +
                cpu = find_first_zero_bit(qib_cpulist,
                                          qib_cpulist_count);
                if (cpu == qib_cpulist_count)
@@@ -1249,7 -1246,10 +1248,7 @@@ static int init_subctxts(struct qib_dev
        if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16,
                uinfo->spu_userversion & 0xffff)) {
                qib_devinfo(dd->pcidev,
 -                       "Mismatched user version (%d.%d) and driver "
 -                       "version (%d.%d) while context sharing. Ensure "
 -                       "that driver and library are from the same "
 -                       "release.\n",
 +                       "Mismatched user version (%d.%d) and driver version (%d.%d) while context sharing. Ensure that driver and library are from the same release.\n",
                         (int) (uinfo->spu_userversion >> 16),
                         (int) (uinfo->spu_userversion & 0xffff),
                         QIB_USER_SWMAJOR, QIB_USER_SWMINOR);
@@@ -1390,7 -1390,6 +1389,7 @@@ static int choose_port_ctxt(struct fil
        }
        if (!ppd) {
                u32 pidx = ctxt % dd->num_pports;
 +
                if (usable(dd->pport + pidx))
                        ppd = dd->pport + pidx;
                else {
@@@ -1438,12 -1437,10 +1437,12 @@@ static int get_a_ctxt(struct file *fp, 
  
        if (alg == QIB_PORT_ALG_ACROSS) {
                unsigned inuse = ~0U;
 +
                /* find device (with ACTIVE ports) with fewest ctxts in use */
                for (ndev = 0; ndev < devmax; ndev++) {
                        struct qib_devdata *dd = qib_lookup(ndev);
                        unsigned cused = 0, cfree = 0, pusable = 0;
 +
                        if (!dd)
                                continue;
                        if (port && port <= dd->num_pports &&
        } else {
                for (ndev = 0; ndev < devmax; ndev++) {
                        struct qib_devdata *dd = qib_lookup(ndev);
 +
                        if (dd) {
                                ret = choose_port_ctxt(fp, dd, port, uinfo);
                                if (!ret)
@@@ -1559,7 -1555,6 +1558,7 @@@ static int find_hca(unsigned int cpu, i
        }
        for (ndev = 0; ndev < devmax; ndev++) {
                struct qib_devdata *dd = qib_lookup(ndev);
 +
                if (dd) {
                        if (pcibus_to_node(dd->pcidev->bus) < 0) {
                                ret = -EINVAL;
diff --combined drivers/scsi/sg.c
@@@ -33,7 -33,6 +33,6 @@@ static int sg_version_num = 30536;    /* 
  #include <linux/sched.h>
  #include <linux/string.h>
  #include <linux/mm.h>
- #include <linux/aio.h>
  #include <linux/errno.h>
  #include <linux/mtio.h>
  #include <linux/ioctl.h>
@@@ -51,6 -50,7 +50,7 @@@
  #include <linux/mutex.h>
  #include <linux/atomic.h>
  #include <linux/ratelimit.h>
+ #include <linux/uio.h>
  
  #include "scsi.h"
  #include <scsi/scsi_dbg.h>
@@@ -546,7 -546,7 +546,7 @@@ static ssize_
  sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp)
  {
        sg_io_hdr_t *hp = &srp->header;
 -      int err = 0;
 +      int err = 0, err2;
        int len;
  
        if (count < SZ_SG_IO_HDR) {
                goto err_out;
        }
  err_out:
 -      err = sg_finish_rem_req(srp);
 -      return (0 == err) ? count : err;
 +      err2 = sg_finish_rem_req(srp);
 +      return err ? : err2 ? : count;
  }
  
  static ssize_t
@@@ -1335,17 -1335,6 +1335,17 @@@ sg_rq_end_io(struct request *rq, int up
        }
        /* Rely on write phase to clean out srp status values, so no "else" */
  
 +      /*
 +       * Free the request as soon as it is complete so that its resources
 +       * can be reused without waiting for userspace to read() the
 +       * result.  But keep the associated bio (if any) around until
 +       * blk_rq_unmap_user() can be called from user context.
 +       */
 +      srp->rq = NULL;
 +      if (rq->cmd != rq->__cmd)
 +              kfree(rq->cmd);
 +      __blk_put_request(rq->q, rq);
 +
        write_lock_irqsave(&sfp->rq_list_lock, iflags);
        if (unlikely(srp->orphan)) {
                if (sfp->keep_orphan)
@@@ -1680,22 -1669,7 +1680,22 @@@ sg_start_req(Sg_request *srp, unsigned 
                        return -ENOMEM;
        }
  
 -      rq = blk_get_request(q, rw, GFP_ATOMIC);
 +      /*
 +       * NOTE
 +       *
 +       * With scsi-mq enabled, there are a fixed number of preallocated
 +       * requests equal in number to shost->can_queue.  If all of the
 +       * preallocated requests are already in use, then using GFP_ATOMIC with
 +       * blk_get_request() will return -EWOULDBLOCK, whereas using GFP_KERNEL
 +       * will cause blk_get_request() to sleep until an active command
 +       * completes, freeing up a request.  Neither option is ideal, but
 +       * GFP_KERNEL is the better choice to prevent userspace from getting an
 +       * unexpected EWOULDBLOCK.
 +       *
 +       * With scsi-mq disabled, blk_get_request() with GFP_KERNEL usually
 +       * does not sleep except under memory pressure.
 +       */
 +      rq = blk_get_request(q, rw, GFP_KERNEL);
        if (IS_ERR(rq)) {
                kfree(long_cmdp);
                return PTR_ERR(rq);
@@@ -1785,10 -1759,10 +1785,10 @@@ sg_finish_rem_req(Sg_request *srp
        SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sfp->parentdp,
                                      "sg_finish_rem_req: res_used=%d\n",
                                      (int) srp->res_used));
 -      if (srp->rq) {
 -              if (srp->bio)
 -                      ret = blk_rq_unmap_user(srp->bio);
 +      if (srp->bio)
 +              ret = blk_rq_unmap_user(srp->bio);
  
 +      if (srp->rq) {
                if (srp->rq->cmd != srp->rq->__cmd)
                        kfree(srp->rq->cmd);
                blk_put_request(srp->rq);
diff --combined fs/aio.c
+++ b/fs/aio.c
@@@ -151,6 -151,38 +151,38 @@@ struct kioctx 
        unsigned                id;
  };
  
+ /*
+  * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+  * cancelled or completed (this makes a certain amount of sense because
+  * successful cancellation - io_cancel() - does deliver the completion to
+  * userspace).
+  *
+  * And since most things don't implement kiocb cancellation and we'd really like
+  * kiocb completion to be lockless when possible, we use ki_cancel to
+  * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+  * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+  */
+ #define KIOCB_CANCELLED               ((void *) (~0ULL))
+ struct aio_kiocb {
+       struct kiocb            common;
+       struct kioctx           *ki_ctx;
+       kiocb_cancel_fn         *ki_cancel;
+       struct iocb __user      *ki_user_iocb;  /* user's aiocb */
+       __u64                   ki_user_data;   /* user's data for completion */
+       struct list_head        ki_list;        /* the aio core uses this
+                                                * for cancellation */
+       /*
+        * If the aio_resfd field of the userspace iocb is not zero,
+        * this is the underlying eventfd context to deliver events to.
+        */
+       struct eventfd_ctx      *ki_eventfd;
+ };
  /*------ sysctl variables----*/
  static DEFINE_SPINLOCK(aio_nr_lock);
  unsigned long aio_nr;         /* current system wide number of aio requests */
@@@ -220,7 -252,7 +252,7 @@@ static int __init aio_setup(void
        if (IS_ERR(aio_mnt))
                panic("Failed to create aio fs mount.");
  
-       kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+       kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
  
        pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@@ -480,8 -512,9 +512,9 @@@ static int aio_setup_ring(struct kioct
  #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
  #define AIO_EVENTS_OFFSET     (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
  
- void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
  {
+       struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
        struct kioctx *ctx = req->ki_ctx;
        unsigned long flags;
  
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
  
- static int kiocb_cancel(struct kiocb *kiocb)
+ static int kiocb_cancel(struct aio_kiocb *kiocb)
  {
        kiocb_cancel_fn *old, *cancel;
  
                cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
        } while (cancel != old);
  
-       return cancel(kiocb);
+       return cancel(&kiocb->common);
  }
  
  static void free_ioctx(struct work_struct *work)
@@@ -550,13 -583,13 +583,13 @@@ static void free_ioctx_reqs(struct perc
  static void free_ioctx_users(struct percpu_ref *ref)
  {
        struct kioctx *ctx = container_of(ref, struct kioctx, users);
-       struct kiocb *req;
+       struct aio_kiocb *req;
  
        spin_lock_irq(&ctx->ctx_lock);
  
        while (!list_empty(&ctx->active_reqs)) {
                req = list_first_entry(&ctx->active_reqs,
-                                      struct kiocb, ki_list);
+                                      struct aio_kiocb, ki_list);
  
                list_del_init(&req->ki_list);
                kiocb_cancel(req);
@@@ -778,22 -811,6 +811,6 @@@ static int kill_ioctx(struct mm_struct 
        return 0;
  }
  
- /* wait_on_sync_kiocb:
-  *    Waits on the given sync kiocb to complete.
-  */
- ssize_t wait_on_sync_kiocb(struct kiocb *req)
- {
-       while (!req->ki_ctx) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (req->ki_ctx)
-                       break;
-               io_schedule();
-       }
-       __set_current_state(TASK_RUNNING);
-       return req->ki_user_data;
- }
- EXPORT_SYMBOL(wait_on_sync_kiocb);
  /*
   * exit_aio: called when the last user of mm goes away.  At this point, there is
   * no way for any new requests to be submitted or any of the io_* syscalls to be
@@@ -948,9 -965,9 +965,9 @@@ static void user_refill_reqs_available(
   *    Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
   */
- static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
  {
-       struct kiocb *req;
+       struct aio_kiocb *req;
  
        if (!get_reqs_available(ctx)) {
                user_refill_reqs_available(ctx);
@@@ -971,10 -988,10 +988,10 @@@ out_put
        return NULL;
  }
  
- static void kiocb_free(struct kiocb *req)
+ static void kiocb_free(struct aio_kiocb *req)
  {
-       if (req->ki_filp)
-               fput(req->ki_filp);
+       if (req->common.ki_filp)
+               fput(req->common.ki_filp);
        if (req->ki_eventfd != NULL)
                eventfd_ctx_put(req->ki_eventfd);
        kmem_cache_free(kiocb_cachep, req);
@@@ -1010,8 -1027,9 +1027,9 @@@ out
  /* aio_complete
   *    Called when the io request on the given iocb is complete.
   */
- void aio_complete(struct kiocb *iocb, long res, long res2)
+ static void aio_complete(struct kiocb *kiocb, long res, long res2)
  {
+       struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
        struct kioctx   *ctx = iocb->ki_ctx;
        struct aio_ring *ring;
        struct io_event *ev_page, *event;
         *    ref, no other paths have a way to get another ref
         *  - the sync task helpfully left a reference to itself in the iocb
         */
-       if (is_sync_kiocb(iocb)) {
-               iocb->ki_user_data = res;
-               smp_wmb();
-               iocb->ki_ctx = ERR_PTR(-EXDEV);
-               wake_up_process(iocb->ki_obj.tsk);
-               return;
-       }
+       BUG_ON(is_sync_kiocb(kiocb));
  
        if (iocb->ki_list.next) {
                unsigned long flags;
        ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
        event = ev_page + pos % AIO_EVENTS_PER_PAGE;
  
-       event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+       event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
        event->data = iocb->ki_user_data;
        event->res = res;
        event->res2 = res2;
        flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
  
        pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
-                ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+                ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
                 res, res2);
  
        /* after flagging the request as done, we
  
        percpu_ref_put(&ctx->reqs);
  }
- EXPORT_SYMBOL(aio_complete);
  
  /* aio_read_events_ring
   *    Pull an event off of the ioctx's event ring.  Returns the number of
@@@ -1285,7 -1296,7 +1296,7 @@@ SYSCALL_DEFINE2(io_setup, unsigned, nr_
  
        ret = -EINVAL;
        if (unlikely(ctx || nr_events == 0)) {
 -              pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
 +              pr_debug("EINVAL: ctx %lu nr_events %u\n",
                         ctx, nr_events);
                goto out;
        }
@@@ -1333,7 -1344,7 +1344,7 @@@ SYSCALL_DEFINE1(io_destroy, aio_context
  
                return ret;
        }
 -      pr_debug("EINVAL: io_destroy: invalid context id\n");
 +      pr_debug("EINVAL: invalid context id\n");
        return -EINVAL;
  }
  
@@@ -1344,12 -1355,13 +1355,13 @@@ typedef ssize_t (rw_iter_op)(struct kio
  static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
                                     int rw, char __user *buf,
                                     unsigned long *nr_segs,
+                                    size_t *len,
                                     struct iovec **iovec,
                                     bool compat)
  {
        ssize_t ret;
  
-       *nr_segs = kiocb->ki_nbytes;
+       *nr_segs = *len;
  
  #ifdef CONFIG_COMPAT
        if (compat)
        if (ret < 0)
                return ret;
  
-       /* ki_nbytes now reflect bytes instead of segs */
-       kiocb->ki_nbytes = ret;
+       /* len now reflect bytes instead of segs */
+       *len = ret;
        return 0;
  }
  
  static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
                                       int rw, char __user *buf,
                                       unsigned long *nr_segs,
+                                      size_t len,
                                       struct iovec *iovec)
  {
-       if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
+       if (unlikely(!access_ok(!rw, buf, len)))
                return -EFAULT;
  
        iovec->iov_base = buf;
-       iovec->iov_len = kiocb->ki_nbytes;
+       iovec->iov_len = len;
        *nr_segs = 1;
        return 0;
  }
   *    Performs the initial checks and io submission.
   */
  static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
-                           char __user *buf, bool compat)
+                           char __user *buf, size_t len, bool compat)
  {
        struct file *file = req->ki_filp;
        ssize_t ret;
@@@ -1423,21 -1436,21 +1436,21 @@@ rw_common
                if (!rw_op && !iter_op)
                        return -EINVAL;
  
-               ret = (opcode == IOCB_CMD_PREADV ||
-                      opcode == IOCB_CMD_PWRITEV)
-                       ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
-                                               &iovec, compat)
-                       : aio_setup_single_vector(req, rw, buf, &nr_segs,
-                                                 iovec);
+               if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+                       ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+                                               &len, &iovec, compat);
+               else
+                       ret = aio_setup_single_vector(req, rw, buf, &nr_segs,
+                                                 len, iovec);
                if (!ret)
-                       ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+                       ret = rw_verify_area(rw, file, &req->ki_pos, len);
                if (ret < 0) {
                        if (iovec != inline_vecs)
                                kfree(iovec);
                        return ret;
                }
  
-               req->ki_nbytes = ret;
+               len = ret;
  
                /* XXX: move/kill - rw_verify_area()? */
                /* This matches the pread()/pwrite() logic */
                        file_start_write(file);
  
                if (iter_op) {
-                       iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
+                       iov_iter_init(&iter, rw, iovec, nr_segs, len);
                        ret = iter_op(req, &iter);
                } else {
                        ret = rw_op(req, iovec, nr_segs, req->ki_pos);
  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                         struct iocb *iocb, bool compat)
  {
-       struct kiocb *req;
+       struct aio_kiocb *req;
        ssize_t ret;
  
        /* enforce forwards compatibility on users */
            (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
            ((ssize_t)iocb->aio_nbytes < 0)
           )) {
 -              pr_debug("EINVAL: io_submit: overflow check\n");
 +              pr_debug("EINVAL: overflow check\n");
                return -EINVAL;
        }
  
        if (unlikely(!req))
                return -EAGAIN;
  
-       req->ki_filp = fget(iocb->aio_fildes);
-       if (unlikely(!req->ki_filp)) {
+       req->common.ki_filp = fget(iocb->aio_fildes);
+       if (unlikely(!req->common.ki_filp)) {
                ret = -EBADF;
                goto out_put_req;
        }
+       req->common.ki_pos = iocb->aio_offset;
+       req->common.ki_complete = aio_complete;
+       req->common.ki_flags = 0;
  
        if (iocb->aio_flags & IOCB_FLAG_RESFD) {
                /*
                        req->ki_eventfd = NULL;
                        goto out_put_req;
                }
+               req->common.ki_flags |= IOCB_EVENTFD;
        }
  
        ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
                goto out_put_req;
        }
  
-       req->ki_obj.user = user_iocb;
+       req->ki_user_iocb = user_iocb;
        req->ki_user_data = iocb->aio_data;
-       req->ki_pos = iocb->aio_offset;
-       req->ki_nbytes = iocb->aio_nbytes;
  
-       ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+       ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
                           (char __user *)(unsigned long)iocb->aio_buf,
+                          iocb->aio_nbytes,
                           compat);
        if (ret)
                goto out_put_req;
@@@ -1643,10 -1660,10 +1660,10 @@@ SYSCALL_DEFINE3(io_submit, aio_context_
  /* lookup_kiocb
   *    Finds a given iocb for cancellation.
   */
- static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
-                                 u32 key)
+ static struct aio_kiocb *
+ lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
  {
-       struct list_head *pos;
+       struct aio_kiocb *kiocb;
  
        assert_spin_locked(&ctx->ctx_lock);
  
                return NULL;
  
        /* TODO: use a hash or array, this sucks. */
-       list_for_each(pos, &ctx->active_reqs) {
-               struct kiocb *kiocb = list_kiocb(pos);
-               if (kiocb->ki_obj.user == iocb)
+       list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+               if (kiocb->ki_user_iocb == iocb)
                        return kiocb;
        }
        return NULL;
@@@ -1676,7 -1692,7 +1692,7 @@@ SYSCALL_DEFINE3(io_cancel, aio_context_
                struct io_event __user *, result)
  {
        struct kioctx *ctx;
-       struct kiocb *kiocb;
+       struct aio_kiocb *kiocb;
        u32 key;
        int ret;
  
diff --combined fs/btrfs/file.c
@@@ -24,7 -24,6 +24,6 @@@
  #include <linux/string.h>
  #include <linux/backing-dev.h>
  #include <linux/mpage.h>
- #include <linux/aio.h>
  #include <linux/falloc.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
@@@ -32,6 -31,7 +31,7 @@@
  #include <linux/compat.h>
  #include <linux/slab.h>
  #include <linux/btrfs.h>
+ #include <linux/uio.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@@ -1811,10 -1811,22 +1811,10 @@@ static ssize_t btrfs_file_write_iter(st
        mutex_unlock(&inode->i_mutex);
  
        /*
 -       * we want to make sure fsync finds this change
 -       * but we haven't joined a transaction running right now.
 -       *
 -       * Later on, someone is sure to update the inode and get the
 -       * real transid recorded.
 -       *
 -       * We set last_trans now to the fs_info generation + 1,
 -       * this will either be one more than the running transaction
 -       * or the generation used for the next transaction if there isn't
 -       * one running right now.
 -       *
         * We also have to set last_sub_trans to the current log transid,
         * otherwise subsequent syncs to a file that's been synced in this
         * transaction will appear to have already occurred.
         */
 -      BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0) {
                err = generic_write_sync(file, pos, num_written);
@@@ -1947,37 -1959,25 +1947,37 @@@ int btrfs_sync_file(struct file *file, 
        atomic_inc(&root->log_batch);
  
        /*
 -       * check the transaction that last modified this inode
 -       * and see if its already been committed
 -       */
 -      if (!BTRFS_I(inode)->last_trans) {
 -              mutex_unlock(&inode->i_mutex);
 -              goto out;
 -      }
 -
 -      /*
 -       * if the last transaction that changed this file was before
 -       * the current transaction, we can bail out now without any
 -       * syncing
 +       * If the last transaction that changed this file was before the current
 +       * transaction and we have the full sync flag set in our inode, we can
 +       * bail out now without any syncing.
 +       *
 +       * Note that we can't bail out if the full sync flag isn't set. This is
 +       * because when the full sync flag is set we start all ordered extents
 +       * and wait for them to fully complete - when they complete they update
 +       * the inode's last_trans field through:
 +       *
 +       *     btrfs_finish_ordered_io() ->
 +       *         btrfs_update_inode_fallback() ->
 +       *             btrfs_update_inode() ->
 +       *                 btrfs_set_inode_last_trans()
 +       *
 +       * So we are sure that last_trans is up to date and can do this check to
 +       * bail out safely. For the fast path, when the full sync flag is not
 +       * set in our inode, we can not do it because we start only our ordered
 +       * extents and don't wait for them to complete (that is when
 +       * btrfs_finish_ordered_io runs), so here at this point their last_trans
 +       * value might be less than or equal to fs_info->last_trans_committed,
 +       * and setting a speculative last_trans for an inode when a buffered
 +       * write is made (such as fs_info->generation + 1 for example) would not
 +       * be reliable since after setting the value and before fsync is called
 +       * any number of transactions can start and commit (transaction kthread
 +       * commits the current transaction periodically), and a transaction
 +       * commit does not start nor waits for ordered extents to complete.
         */
        smp_mb();
        if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
 -          BTRFS_I(inode)->last_trans <=
 -          root->fs_info->last_trans_committed) {
 -              BTRFS_I(inode)->last_trans = 0;
 -
 +          (full_sync && BTRFS_I(inode)->last_trans <=
 +           root->fs_info->last_trans_committed)) {
                /*
                * We've had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
@@@ -2275,8 -2275,6 +2275,8 @@@ static int btrfs_punch_hole(struct inod
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
 +      bool truncated_page = false;
 +      bool updated_inode = false;
  
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
 -              if (offset < ino_size)
 +              if (offset < ino_size) {
 +                      truncated_page = true;
                        ret = btrfs_truncate_page(inode, offset, len, 0);
 +              } else {
 +                      ret = 0;
 +              }
                goto out_only_mutex;
        }
  
        /* zero back part of the first page */
        if (offset < ino_size) {
 +              truncated_page = true;
                ret = btrfs_truncate_page(inode, offset, 0, 0);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
                if (!ret) {
                        /* zero the front end of the last page */
                        if (tail_start + tail_len < ino_size) {
 +                              truncated_page = true;
                                ret = btrfs_truncate_page(inode,
                                                tail_start + tail_len, 0, 1);
                                if (ret)
        }
  
        if (lockend < lockstart) {
 -              mutex_unlock(&inode->i_mutex);
 -              return 0;
 +              ret = 0;
 +              goto out_only_mutex;
        }
  
        while (1) {
@@@ -2514,7 -2506,6 +2514,7 @@@ out_trans
  
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
 +      updated_inode = true;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
  out_free:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
  out_only_mutex:
 +      if (!updated_inode && truncated_page && !ret && !err) {
 +              /*
 +               * If we only end up zeroing part of a page, we still need to
 +               * update the inode item, so that all the time fields are
 +               * updated as well as the necessary btrfs inode in memory fields
 +               * for detecting, at fsync time, if the inode isn't yet in the
 +               * log tree or it's there but not up to date.
 +               */
 +              trans = btrfs_start_transaction(root, 1);
 +              if (IS_ERR(trans)) {
 +                      err = PTR_ERR(trans);
 +              } else {
 +                      err = btrfs_update_inode(trans, root, inode);
 +                      ret = btrfs_end_transaction(trans, root);
 +              }
 +      }
        mutex_unlock(&inode->i_mutex);
        if (ret && !err)
                err = ret;
diff --combined fs/btrfs/inode.c
@@@ -32,7 -32,6 +32,6 @@@
  #include <linux/writeback.h>
  #include <linux/statfs.h>
  #include <linux/compat.h>
- #include <linux/aio.h>
  #include <linux/bit_spinlock.h>
  #include <linux/xattr.h>
  #include <linux/posix_acl.h>
@@@ -43,6 -42,7 +42,7 @@@
  #include <linux/btrfs.h>
  #include <linux/blkdev.h>
  #include <linux/posix_acl_xattr.h>
+ #include <linux/uio.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@@ -108,13 -108,6 +108,13 @@@ static struct extent_map *create_pinned
  
  static int btrfs_dirty_inode(struct inode *inode);
  
 +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 +void btrfs_test_inode_set_ops(struct inode *inode)
 +{
 +      BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 +}
 +#endif
 +
  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@@ -1537,32 -1530,10 +1537,32 @@@ static int run_delalloc_range(struct in
  static void btrfs_split_extent_hook(struct inode *inode,
                                    struct extent_state *orig, u64 split)
  {
 +      u64 size;
 +
        /* not delalloc, ignore it */
        if (!(orig->state & EXTENT_DELALLOC))
                return;
  
 +      size = orig->end - orig->start + 1;
 +      if (size > BTRFS_MAX_EXTENT_SIZE) {
 +              u64 num_extents;
 +              u64 new_size;
 +
 +              /*
 +               * See the explanation in btrfs_merge_extent_hook, the same
 +               * applies here, just in reverse.
 +               */
 +              new_size = orig->end - split + 1;
 +              num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                                      BTRFS_MAX_EXTENT_SIZE);
 +              new_size = split - orig->start;
 +              num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                                      BTRFS_MAX_EXTENT_SIZE);
 +              if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                            BTRFS_MAX_EXTENT_SIZE) >= num_extents)
 +                      return;
 +      }
 +
        spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->outstanding_extents++;
        spin_unlock(&BTRFS_I(inode)->lock);
@@@ -1578,55 -1549,10 +1578,55 @@@ static void btrfs_merge_extent_hook(str
                                    struct extent_state *new,
                                    struct extent_state *other)
  {
 +      u64 new_size, old_size;
 +      u64 num_extents;
 +
        /* not delalloc, ignore it */
        if (!(other->state & EXTENT_DELALLOC))
                return;
  
 +      if (new->start > other->start)
 +              new_size = new->end - other->start + 1;
 +      else
 +              new_size = other->end - new->start + 1;
 +
 +      /* we're not bigger than the max, unreserve the space and go */
 +      if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
 +              spin_lock(&BTRFS_I(inode)->lock);
 +              BTRFS_I(inode)->outstanding_extents--;
 +              spin_unlock(&BTRFS_I(inode)->lock);
 +              return;
 +      }
 +
 +      /*
 +       * We have to add up either side to figure out how many extents were
 +       * accounted for before we merged into one big extent.  If the number of
 +       * extents we accounted for is <= the amount we need for the new range
 +       * then we can return, otherwise drop.  Think of it like this
 +       *
 +       * [ 4k][MAX_SIZE]
 +       *
 +       * So we've grown the extent by a MAX_SIZE extent, this would mean we
 +       * need 2 outstanding extents, on one side we have 1 and the other side
 +       * we have 1 so they are == and we can return.  But in this case
 +       *
 +       * [MAX_SIZE+4k][MAX_SIZE+4k]
 +       *
 +       * Each range on their own accounts for 2 extents, but merged together
 +       * they are only 3 extents worth of accounting, so we need to drop in
 +       * this case.
 +       */
 +      old_size = other->end - other->start + 1;
 +      num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                              BTRFS_MAX_EXTENT_SIZE);
 +      old_size = new->end - new->start + 1;
 +      num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                               BTRFS_MAX_EXTENT_SIZE);
 +
 +      if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
 +                    BTRFS_MAX_EXTENT_SIZE) >= num_extents)
 +              return;
 +
        spin_lock(&BTRFS_I(inode)->lock);
        BTRFS_I(inode)->outstanding_extents--;
        spin_unlock(&BTRFS_I(inode)->lock);
@@@ -1678,7 -1604,7 +1678,7 @@@ static void btrfs_del_delalloc_inode(st
   * have pending delalloc work to be done.
   */
  static void btrfs_set_bit_hook(struct inode *inode,
 -                             struct extent_state *state, unsigned long *bits)
 +                             struct extent_state *state, unsigned *bits)
  {
  
        if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
 +              /* For sanity tests */
 +              if (btrfs_test_is_dummy_root(root))
 +                      return;
 +
                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
   */
  static void btrfs_clear_bit_hook(struct inode *inode,
                                 struct extent_state *state,
 -                               unsigned long *bits)
 +                               unsigned *bits)
  {
        u64 len = state->end + 1 - state->start;
 +      u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
 +                                  BTRFS_MAX_EXTENT_SIZE);
  
        spin_lock(&BTRFS_I(inode)->lock);
        if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
                        *bits &= ~EXTENT_FIRST_DELALLOC;
                } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
                        spin_lock(&BTRFS_I(inode)->lock);
 -                      BTRFS_I(inode)->outstanding_extents--;
 +                      BTRFS_I(inode)->outstanding_extents -= num_extents;
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
                    root != root->fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
  
 +              /* For sanity tests. */
 +              if (btrfs_test_is_dummy_root(root))
 +                      return;
 +
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
@@@ -3029,7 -2945,7 +3029,7 @@@ static int __readpage_endio_check(struc
        return 0;
  zeroit:
        if (__ratelimit(&_rs))
 -              btrfs_info(BTRFS_I(inode)->root->fs_info,
 +              btrfs_warn(BTRFS_I(inode)->root->fs_info,
                           "csum failed ino %llu off %llu csum %u expected csum %u",
                           btrfs_ino(inode), start, csum, csum_expected);
        memset(kaddr + pgoff, 1, len);
@@@ -3491,7 -3407,7 +3491,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
  
  out:
        if (ret)
 -              btrfs_crit(root->fs_info,
 +              btrfs_err(root->fs_info,
                        "could not do orphan cleanup %d", ret);
        btrfs_free_path(path);
        return ret;
@@@ -3574,6 -3490,7 +3574,6 @@@ static void btrfs_read_locked_inode(str
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_inode_item *inode_item;
 -      struct btrfs_timespec *tspec;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        unsigned long ptr;
        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
        btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
  
 -      tspec = btrfs_inode_atime(inode_item);
 -      inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 -      inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 +      inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
 +      inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
  
 -      tspec = btrfs_inode_mtime(inode_item);
 -      inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 -      inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 +      inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
 +      inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
  
 -      tspec = btrfs_inode_ctime(inode_item);
 -      inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 -      inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 +      inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
 +      inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
 +
 +      BTRFS_I(inode)->i_otime.tv_sec =
 +              btrfs_timespec_sec(leaf, &inode_item->otime);
 +      BTRFS_I(inode)->i_otime.tv_nsec =
 +              btrfs_timespec_nsec(leaf, &inode_item->otime);
  
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@@ -3741,26 -3656,21 +3741,26 @@@ static void fill_inode_item(struct btrf
        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
  
 -      btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
 +      btrfs_set_token_timespec_sec(leaf, &item->atime,
                                     inode->i_atime.tv_sec, &token);
 -      btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
 +      btrfs_set_token_timespec_nsec(leaf, &item->atime,
                                      inode->i_atime.tv_nsec, &token);
  
 -      btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
 +      btrfs_set_token_timespec_sec(leaf, &item->mtime,
                                     inode->i_mtime.tv_sec, &token);
 -      btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
 +      btrfs_set_token_timespec_nsec(leaf, &item->mtime,
                                      inode->i_mtime.tv_nsec, &token);
  
 -      btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
 +      btrfs_set_token_timespec_sec(leaf, &item->ctime,
                                     inode->i_ctime.tv_sec, &token);
 -      btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
 +      btrfs_set_token_timespec_nsec(leaf, &item->ctime,
                                      inode->i_ctime.tv_nsec, &token);
  
 +      btrfs_set_token_timespec_sec(leaf, &item->otime,
 +                                   BTRFS_I(inode)->i_otime.tv_sec, &token);
 +      btrfs_set_token_timespec_nsec(leaf, &item->otime,
 +                                    BTRFS_I(inode)->i_otime.tv_nsec, &token);
 +
        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
                                     &token);
        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@@ -5097,7 -5007,6 +5097,7 @@@ static int fixup_tree_root_location(str
        struct btrfs_root *new_root;
        struct btrfs_root_ref *ref;
        struct extent_buffer *leaf;
 +      struct btrfs_key key;
        int ret;
        int err = 0;
  
        }
  
        err = -ENOENT;
 -      ret = btrfs_find_item(root->fs_info->tree_root, path,
 -                              BTRFS_I(dir)->root->root_key.objectid,
 -                              location->objectid, BTRFS_ROOT_REF_KEY, NULL);
 +      key.objectid = BTRFS_I(dir)->root->root_key.objectid;
 +      key.type = BTRFS_ROOT_REF_KEY;
 +      key.offset = location->objectid;
 +
 +      ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
 +                              0, 0);
        if (ret) {
                if (ret < 0)
                        err = ret;
@@@ -5352,10 -5258,7 +5352,10 @@@ static struct inode *new_simple_dir(str
        inode->i_op = &btrfs_dir_ro_inode_operations;
        inode->i_fop = &simple_dir_operations;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 -      inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 +      inode->i_mtime = CURRENT_TIME;
 +      inode->i_atime = inode->i_mtime;
 +      inode->i_ctime = inode->i_mtime;
 +      BTRFS_I(inode)->i_otime = inode->i_mtime;
  
        return inode;
  }
@@@ -5923,12 -5826,7 +5923,12 @@@ static struct inode *btrfs_new_inode(st
  
        inode_init_owner(inode, dir, mode);
        inode_set_bytes(inode, 0);
 -      inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 +
 +      inode->i_mtime = CURRENT_TIME;
 +      inode->i_atime = inode->i_mtime;
 +      inode->i_ctime = inode->i_mtime;
 +      BTRFS_I(inode)->i_otime = inode->i_mtime;
 +
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                  struct btrfs_inode_item);
        memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@@ -7236,28 -7134,17 +7236,28 @@@ static int btrfs_get_blocks_direct(stru
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
 +      u64 *outstanding_extents = NULL;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
  
        if (create)
 -              unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
 +              unlock_bits |= EXTENT_DIRTY;
        else
                len = min_t(u64, len, root->sectorsize);
  
        lockstart = start;
        lockend = start + len - 1;
  
 +      if (current->journal_info) {
 +              /*
 +               * Need to pull our outstanding extents and set journal_info to NULL so
 +               * that anything that needs to check if there's a transaction doesn't get
 +               * confused.
 +               */
 +              outstanding_extents = current->journal_info;
 +              current->journal_info = NULL;
 +      }
 +
        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
             em->block_start != EXTENT_MAP_HOLE)) {
                int type;
 -              int ret;
                u64 block_start, orig_start, orig_block_len, ram_bytes;
  
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@@ -7381,21 -7269,14 +7381,21 @@@ unlock
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
  
 -              spin_lock(&BTRFS_I(inode)->lock);
 -              BTRFS_I(inode)->outstanding_extents++;
 -              spin_unlock(&BTRFS_I(inode)->lock);
 +              /*
 +               * If we have an outstanding_extents count still set then we're
 +               * within our reservation, otherwise we need to adjust our inode
 +               * counter appropriately.
 +               */
 +              if (*outstanding_extents) {
 +                      (*outstanding_extents)--;
 +              } else {
 +                      spin_lock(&BTRFS_I(inode)->lock);
 +                      BTRFS_I(inode)->outstanding_extents++;
 +                      spin_unlock(&BTRFS_I(inode)->lock);
 +              }
  
 -              ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
 -                                   lockstart + len - 1, EXTENT_DELALLOC, NULL,
 -                                   &cached_state, GFP_NOFS);
 -              BUG_ON(ret);
 +              current->journal_info = outstanding_extents;
 +              btrfs_free_reserved_data_space(inode, len);
        }
  
        /*
  unlock_err:
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
 +      if (outstanding_extents)
 +              current->journal_info = outstanding_extents;
        return ret;
  }
  
@@@ -7926,7 -7805,8 +7926,7 @@@ static int btrfs_submit_direct_hook(in
        }
  
        /* async crcs make it difficult to collect full stripe writes. */
 -      if (btrfs_get_alloc_profile(root, 1) &
 -          (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
 +      if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
                async_submit = 0;
        else
                async_submit = 1;
@@@ -8119,7 -7999,6 +8119,7 @@@ static ssize_t btrfs_direct_IO(int rw, 
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
 +      u64 outstanding_extents = 0;
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
                ret = btrfs_delalloc_reserve_space(inode, count);
                if (ret)
                        goto out;
 +              outstanding_extents = div64_u64(count +
 +                                              BTRFS_MAX_EXTENT_SIZE - 1,
 +                                              BTRFS_MAX_EXTENT_SIZE);
 +
 +              /*
 +               * We need to know how many extents we reserved so that we can
 +               * do the accounting properly if we go over the number we
 +               * originally calculated.  Abuse current->journal_info for this.
 +               */
 +              current->journal_info = &outstanding_extents;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_done(inode);
                        iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
 +              current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED)
                        btrfs_delalloc_release_space(inode, count);
                else if (ret >= 0 && (size_t)ret < count)
                        btrfs_delalloc_release_space(inode,
                                                     count - (size_t)ret);
 -              else
 -                      btrfs_delalloc_release_metadata(inode, 0);
        }
  out:
        if (wakeup)
@@@ -8705,9 -8575,6 +8705,9 @@@ struct inode *btrfs_alloc_inode(struct 
  
        ei->delayed_node = NULL;
  
 +      ei->i_otime.tv_sec = 0;
 +      ei->i_otime.tv_nsec = 0;
 +
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
diff --combined fs/ceph/file.c
@@@ -7,7 -7,6 +7,6 @@@
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/writeback.h>
- #include <linux/aio.h>
  #include <linux/falloc.h>
  
  #include "super.h"
@@@ -275,10 -274,10 +274,10 @@@ int ceph_atomic_open(struct inode *dir
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
 +      err = ceph_handle_snapdir(req, dentry, err);
        if (err)
                goto out_req;
  
 -      err = ceph_handle_snapdir(req, dentry, err);
        if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
  
        }
        if (err)
                goto out_req;
 -      if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
 +      if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) {
                /* make vfs retry on splice, ENOENT, or symlink */
                dout("atomic_open finish_no_open on dn %p\n", dn);
                err = finish_no_open(file, dn);
@@@ -392,14 -391,13 +391,14 @@@ more
        if (ret >= 0) {
                int didpages;
                if (was_short && (pos + ret < inode->i_size)) {
 -                      u64 tmp = min(this_len - ret,
 -                                      inode->i_size - pos - ret);
 +                      int zlen = min(this_len - ret,
 +                                     inode->i_size - pos - ret);
 +                      int zoff = (o_direct ? buf_align : io_align) +
 +                                  read + ret;
                        dout(" zero gap %llu to %llu\n",
 -                              pos + ret, pos + ret + tmp);
 -                      ceph_zero_page_vector_range(page_align + read + ret,
 -                                                      tmp, pages);
 -                      ret += tmp;
 +                              pos + ret, pos + ret + zlen);
 +                      ceph_zero_page_vector_range(zoff, zlen, pages);
 +                      ret += zlen;
                }
  
                didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@@ -808,7 -806,7 +807,7 @@@ static ssize_t ceph_read_iter(struct ki
  {
        struct file *filp = iocb->ki_filp;
        struct ceph_file_info *fi = filp->private_data;
-       size_t len = iocb->ki_nbytes;
+       size_t len = iov_iter_count(to);
        struct inode *inode = file_inode(filp);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct page *pinned_page = NULL;
@@@ -879,34 -877,28 +878,34 @@@ again
  
                i_size = i_size_read(inode);
                if (retry_op == READ_INLINE) {
 -                      /* does not support inline data > PAGE_SIZE */
 -                      if (i_size > PAGE_CACHE_SIZE) {
 -                              ret = -EIO;
 -                      } else if (iocb->ki_pos < i_size) {
 +                      BUG_ON(ret > 0 || read > 0);
 +                      if (iocb->ki_pos < i_size &&
 +                          iocb->ki_pos < PAGE_CACHE_SIZE) {
                                loff_t end = min_t(loff_t, i_size,
                                                   iocb->ki_pos + len);
 +                              end = min_t(loff_t, end, PAGE_CACHE_SIZE);
                                if (statret < end)
                                        zero_user_segment(page, statret, end);
                                ret = copy_page_to_iter(page,
                                                iocb->ki_pos & ~PAGE_MASK,
                                                end - iocb->ki_pos, to);
                                iocb->ki_pos += ret;
 -                      } else {
 -                              ret = 0;
 +                              read += ret;
 +                      }
 +                      if (iocb->ki_pos < i_size && read < len) {
 +                              size_t zlen = min_t(size_t, len - read,
 +                                                  i_size - iocb->ki_pos);
 +                              ret = iov_iter_zero(zlen, to);
 +                              iocb->ki_pos += ret;
 +                              read += ret;
                        }
                        __free_pages(page, 0);
 -                      return ret;
 +                      return read;
                }
  
                /* hit EOF or hole? */
                if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
 -                      ret < len) {
 +                  ret < len) {
                        dout("sync_read hit hole, ppos %lld < size %lld"
                             ", reading more\n", iocb->ki_pos,
                             inode->i_size);
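
The READ_INLINE path above first serves the part of the request that the inline page covers, then zero-fills whatever remains of the request that still lies below i_size. A condensed, hypothetical sketch of that pattern (inline_read_tail() and its argument list are illustrative, not part of the patch):

#include <linux/pagemap.h>
#include <linux/uio.h>

/* Illustrative helper, not in the patch: copy from an inline page, then
 * zero-fill the rest of the requested range that is still inside i_size. */
static ssize_t inline_read_tail(struct kiocb *iocb, struct iov_iter *to,
				struct page *page, loff_t i_size, size_t len)
{
	size_t read = 0;
	ssize_t ret;

	/* Copy whatever the inline page covers, capped at one page. */
	if (iocb->ki_pos < i_size && iocb->ki_pos < PAGE_CACHE_SIZE) {
		loff_t end = min_t(loff_t, i_size, iocb->ki_pos + len);

		end = min_t(loff_t, end, PAGE_CACHE_SIZE);
		ret = copy_page_to_iter(page, iocb->ki_pos & ~PAGE_MASK,
					end - iocb->ki_pos, to);
		iocb->ki_pos += ret;
		read += ret;
	}
	/* The object may be larger than the inline data: zero the rest. */
	if (iocb->ki_pos < i_size && read < len) {
		size_t zlen = min_t(size_t, len - read,
				    i_size - iocb->ki_pos);

		ret = iov_iter_zero(zlen, to);
		iocb->ki_pos += ret;
		read += ret;
	}
	return read;
}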
diff --combined fs/ecryptfs/file.c
@@@ -31,7 -31,6 +31,6 @@@
  #include <linux/security.h>
  #include <linux/compat.h>
  #include <linux/fs_stack.h>
- #include <linux/aio.h>
  #include "ecryptfs_kernel.h"
  
  /**
@@@ -52,12 -51,6 +51,6 @@@ static ssize_t ecryptfs_read_update_ati
        struct file *file = iocb->ki_filp;
  
        rc = generic_file_read_iter(iocb, to);
-       /*
-        * Even though this is a async interface, we need to wait
-        * for IO to finish to update atime
-        */
-       if (-EIOCBQUEUED == rc)
-               rc = wait_on_sync_kiocb(iocb);
        if (rc >= 0) {
                path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
                touch_atime(path);
@@@ -230,7 -223,7 +223,7 @@@ static int ecryptfs_open(struct inode *
        }
        ecryptfs_set_file_lower(
                file, ecryptfs_inode_to_private(inode)->lower_file);
 -      if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
 +      if (d_is_dir(ecryptfs_dentry)) {
                ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
                mutex_lock(&crypt_stat->cs_mutex);
                crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
@@@ -303,22 -296,9 +296,22 @@@ ecryptfs_unlocked_ioctl(struct file *fi
        struct file *lower_file = ecryptfs_file_to_lower(file);
        long rc = -ENOTTY;
  
 -      if (lower_file->f_op->unlocked_ioctl)
 +      if (!lower_file->f_op->unlocked_ioctl)
 +              return rc;
 +
 +      switch (cmd) {
 +      case FITRIM:
 +      case FS_IOC_GETFLAGS:
 +      case FS_IOC_SETFLAGS:
 +      case FS_IOC_GETVERSION:
 +      case FS_IOC_SETVERSION:
                rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
 -      return rc;
 +              fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
 +
 +              return rc;
 +      default:
 +              return rc;
 +      }
  }
  
  #ifdef CONFIG_COMPAT
@@@ -328,22 -308,9 +321,22 @@@ ecryptfs_compat_ioctl(struct file *file
        struct file *lower_file = ecryptfs_file_to_lower(file);
        long rc = -ENOIOCTLCMD;
  
 -      if (lower_file->f_op->compat_ioctl)
 +      if (!lower_file->f_op->compat_ioctl)
 +              return rc;
 +
 +      switch (cmd) {
 +      case FITRIM:
 +      case FS_IOC32_GETFLAGS:
 +      case FS_IOC32_SETFLAGS:
 +      case FS_IOC32_GETVERSION:
 +      case FS_IOC32_SETVERSION:
                rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
 -      return rc;
 +              fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
 +
 +              return rc;
 +      default:
 +              return rc;
 +      }
  }
  #endif
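
The ioctl hunks above replace blind pass-through with an explicit whitelist and copy the lower inode's attributes back, so that flag changes made by the lower filesystem become visible on the eCryptfs inode. A minimal sketch of that stacking pattern, assuming a hypothetical stacked_ioctl() helper that already has the lower file resolved:

#include <linux/fs.h>
#include <linux/fs_stack.h>

/* Hypothetical helper showing the whitelist-and-copy-back pattern. */
static long stacked_ioctl(struct file *file, struct file *lower_file,
			  unsigned int cmd, unsigned long arg)
{
	long rc = -ENOTTY;

	if (!lower_file->f_op->unlocked_ioctl)
		return rc;

	switch (cmd) {
	case FITRIM:
	case FS_IOC_GETFLAGS:
	case FS_IOC_SETFLAGS:
	case FS_IOC_GETVERSION:
	case FS_IOC_SETVERSION:
		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
		/* e.g. SETFLAGS may have changed i_flags on the lower inode */
		fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
		return rc;
	default:
		return rc;
	}
}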
  
diff --combined fs/ext4/indirect.c
@@@ -20,9 -20,9 +20,9 @@@
   *    (sct@redhat.com), 1993, 1998
   */
  
- #include <linux/aio.h>
  #include "ext4_jbd2.h"
  #include "truncate.h"
+ #include <linux/uio.h>
  
  #include <trace/events/ext4.h>
  
@@@ -1401,7 -1401,10 +1401,7 @@@ end_range
                                 * to free. Everything was covered by the start
                                 * of the range.
                                 */
 -                              return 0;
 -                      } else {
 -                              /* Shared branch grows from an indirect block */
 -                              partial2--;
 +                              goto do_indirects;
                        }
                } else {
                        /*
        /* Punch happened within the same level (n == n2) */
        partial = ext4_find_shared(inode, n, offsets, chain, &nr);
        partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
 -      /*
 -       * ext4_find_shared returns Indirect structure which
 -       * points to the last element which should not be
 -       * removed by truncate. But this is end of the range
 -       * in punch_hole so we need to point to the next element
 -       */
 -      partial2->p++;
 -      while ((partial > chain) || (partial2 > chain2)) {
 -              /* We're at the same block, so we're almost finished */
 -              if ((partial->bh && partial2->bh) &&
 -                  (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
 -                      if ((partial > chain) && (partial2 > chain2)) {
 +
 +      /* Free top, but only if partial2 isn't its subtree. */
 +      if (nr) {
 +              int level = min(partial - chain, partial2 - chain2);
 +              int i;
 +              int subtree = 1;
 +
 +              for (i = 0; i <= level; i++) {
 +                      if (offsets[i] != offsets2[i]) {
 +                              subtree = 0;
 +                              break;
 +                      }
 +              }
 +
 +              if (!subtree) {
 +                      if (partial == chain) {
 +                              /* Shared branch grows from the inode */
 +                              ext4_free_branches(handle, inode, NULL,
 +                                                 &nr, &nr+1,
 +                                                 (chain+n-1) - partial);
 +                              *partial->p = 0;
 +                      } else {
 +                              /* Shared branch grows from an indirect block */
 +                              BUFFER_TRACE(partial->bh, "get_write_access");
                                ext4_free_branches(handle, inode, partial->bh,
 -                                                 partial->p + 1,
 -                                                 partial2->p,
 +                                                 partial->p,
 +                                                 partial->p+1,
                                                   (chain+n-1) - partial);
 -                              BUFFER_TRACE(partial->bh, "call brelse");
 -                              brelse(partial->bh);
 -                              BUFFER_TRACE(partial2->bh, "call brelse");
 -                              brelse(partial2->bh);
                        }
 -                      return 0;
                }
 +      }
 +
 +      if (!nr2) {
                /*
 -               * Clear the ends of indirect blocks on the shared branch
 -               * at the start of the range
 +               * ext4_find_shared returns Indirect structure which
 +               * points to the last element which should not be
 +               * removed by truncate. But this is end of the range
 +               * in punch_hole so we need to point to the next element
                 */
 -              if (partial > chain) {
 +              partial2->p++;
 +      }
 +
 +      while (partial > chain || partial2 > chain2) {
 +              int depth = (chain+n-1) - partial;
 +              int depth2 = (chain2+n2-1) - partial2;
 +
 +              if (partial > chain && partial2 > chain2 &&
 +                  partial->bh->b_blocknr == partial2->bh->b_blocknr) {
 +                      /*
 +                       * We've converged on the same block. Clear the range,
 +                       * then we're done.
 +                       */
                        ext4_free_branches(handle, inode, partial->bh,
 -                                 partial->p + 1,
 -                                 (__le32 *)partial->bh->b_data+addr_per_block,
 -                                 (chain+n-1) - partial);
 +                                         partial->p + 1,
 +                                         partial2->p,
 +                                         (chain+n-1) - partial);
                        BUFFER_TRACE(partial->bh, "call brelse");
                        brelse(partial->bh);
 -                      partial--;
 +                      BUFFER_TRACE(partial2->bh, "call brelse");
 +                      brelse(partial2->bh);
 +                      return 0;
                }
 +
                /*
 -               * Clear the ends of indirect blocks on the shared branch
 -               * at the end of the range
 +               * The start and end partial branches may not be at the same
 +               * level even though the punch happened within one level. So, we
 +               * give them a chance to arrive at the same level, then walk
 +               * them in step with each other until we converge on the same
 +               * block.
                 */
 -              if (partial2 > chain2) {
 +              if (partial > chain && depth <= depth2) {
 +                      ext4_free_branches(handle, inode, partial->bh,
 +                                         partial->p + 1,
 +                                         (__le32 *)partial->bh->b_data+addr_per_block,
 +                                         (chain+n-1) - partial);
 +                      BUFFER_TRACE(partial->bh, "call brelse");
 +                      brelse(partial->bh);
 +                      partial--;
 +              }
 +              if (partial2 > chain2 && depth2 <= depth) {
                        ext4_free_branches(handle, inode, partial2->bh,
                                           (__le32 *)partial2->bh->b_data,
                                           partial2->p,
 -                                         (chain2+n-1) - partial2);
 +                                         (chain2+n2-1) - partial2);
                        BUFFER_TRACE(partial2->bh, "call brelse");
                        brelse(partial2->bh);
                        partial2--;
                }
        }
 +      return 0;
  
  do_indirects:
        /* Kill the remaining (whole) subtrees */
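
The new "Free top" logic above only releases the shared top branch when the end-of-range chain does not descend through the same indirect blocks as the start-of-range chain. A stand-alone rendering of that subtree test (branches_share_subtree() is a hypothetical helper; the patch open-codes the loop over the offsets arrays):

/*
 * Hypothetical helper; the patch open-codes this comparison.  "unsigned int"
 * stands in for the ext4_lblk_t offsets filled in by ext4_block_to_path().
 */
static int branches_share_subtree(const unsigned int *offsets,
				  const unsigned int *offsets2, int level)
{
	int i;

	for (i = 0; i <= level; i++)
		if (offsets[i] != offsets2[i])
			return 0;	/* paths diverge at or above @level */
	return 1;			/* same indirect blocks down to @level */
}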
diff --combined fs/ext4/inode.c
@@@ -37,7 -37,6 +37,6 @@@
  #include <linux/printk.h>
  #include <linux/slab.h>
  #include <linux/ratelimit.h>
- #include <linux/aio.h>
  #include <linux/bitops.h>
  
  #include "ext4_jbd2.h"
@@@ -1024,7 -1023,6 +1023,7 @@@ static int ext4_write_end(struct file *
  {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
 +      loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int i_size_changed = 0;
  
        unlock_page(page);
        page_cache_release(page);
  
 +      if (old_size < pos)
 +              pagecache_isize_extended(inode, old_size, pos);
        /*
         * Don't mark the inode dirty under page lock. First, it unnecessarily
         * makes the holding time of page lock longer. Second, it forces lock
@@@ -1098,7 -1094,6 +1097,7 @@@ static int ext4_journalled_write_end(st
  {
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
 +      loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
        unlock_page(page);
        page_cache_release(page);
  
 +      if (old_size < pos)
 +              pagecache_isize_extended(inode, old_size, pos);
 +
        if (size_changed) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
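
Both write_end hunks above capture the old i_size before the size update and call pagecache_isize_extended() once the new size is visible, so the page that used to contain EOF gets its tail zeroed for mmap readers. A hedged sketch of that ordering, using generic_write_end() as a stand-in for ext4's own completion path (my_write_end() is illustrative):

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* Illustrative ->write_end showing where pagecache_isize_extended() goes. */
static int my_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	int ret;

	/* generic_write_end() updates i_size and drops the page lock. */
	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);

	/* Only once the new size is visible: fix up the old EOF page. */
	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
	return ret;
}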
diff --combined fs/fuse/dev.c
@@@ -19,7 -19,6 +19,6 @@@
  #include <linux/pipe_fs_i.h>
  #include <linux/swap.h>
  #include <linux/splice.h>
- #include <linux/aio.h>
  
  MODULE_ALIAS_MISCDEV(FUSE_MINOR);
  MODULE_ALIAS("devname:fuse");
@@@ -890,8 -889,8 +889,8 @@@ static int fuse_try_move_page(struct fu
  
        newpage = buf->page;
  
 -      if (WARN_ON(!PageUptodate(newpage)))
 -              return -EIO;
 +      if (!PageUptodate(newpage))
 +              SetPageUptodate(newpage);
  
        ClearPageMappedToDisk(newpage);
  
@@@ -1353,17 -1352,6 +1352,17 @@@ static ssize_t fuse_dev_do_read(struct 
        return err;
  }
  
 +static int fuse_dev_open(struct inode *inode, struct file *file)
 +{
 +      /*
 +       * The private_data field of the fuse device file is used to hold
 +       * the fuse_conn(ection) once it is mounted, and to keep track of
 +       * whether the file has been mounted already.
 +       */
 +      file->private_data = NULL;
 +      return 0;
 +}
 +
  static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
                              unsigned long nr_segs, loff_t pos)
  {
@@@ -1808,9 -1796,6 +1807,9 @@@ copy_finish
  static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                       unsigned int size, struct fuse_copy_state *cs)
  {
 +      /* Don't try to move pages (yet) */
 +      cs->move_pages = 0;
 +
        switch (code) {
        case FUSE_NOTIFY_POLL:
                return fuse_notify_poll(fc, size, cs);
@@@ -2231,7 -2216,6 +2230,7 @@@ static int fuse_dev_fasync(int fd, stru
  
  const struct file_operations fuse_dev_operations = {
        .owner          = THIS_MODULE,
 +      .open           = fuse_dev_open,
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = fuse_dev_read,
diff --combined fs/nfs/direct.c
@@@ -265,7 -265,7 +265,7 @@@ ssize_t nfs_direct_IO(int rw, struct ki
  
        return -EINVAL;
  #else
-       VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+       VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
  
        if (rw == READ)
                return nfs_file_direct_read(iocb, iter, pos);
@@@ -283,7 -283,7 +283,7 @@@ static void nfs_direct_release_pages(st
  void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
                              struct nfs_direct_req *dreq)
  {
 -      cinfo->lock = &dreq->lock;
 +      cinfo->lock = &dreq->inode->i_lock;
        cinfo->mds = &dreq->mds_cinfo;
        cinfo->ds = &dreq->ds_cinfo;
        cinfo->dreq = dreq;
@@@ -393,7 -393,7 +393,7 @@@ static void nfs_direct_complete(struct 
                long res = (long) dreq->error;
                if (!res)
                        res = (long) dreq->count;
-               aio_complete(dreq->iocb, res, 0);
+               dreq->iocb->ki_complete(dreq->iocb, res, 0);
        }
  
        complete_all(&dreq->completion);
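
The completion hunk above switches from aio_complete() to the kiocb's ->ki_complete() callback; with that convention a synchronous kiocb has no callback and the submitter just consumes the return value. A small, hedged sketch of the resulting dispatch (finish_request() is a hypothetical helper; nfs_direct_complete() instead checks dreq->iocb):

#include <linux/fs.h>

/* Hypothetical helper: complete an async kiocb, or do nothing for sync I/O. */
static void finish_request(struct kiocb *iocb, long res)
{
	if (iocb->ki_complete)
		iocb->ki_complete(iocb, res, 0);
	/* For a synchronous kiocb the caller just uses the returned count. */
}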
diff --combined fs/nfs/file.c
@@@ -26,7 -26,6 +26,6 @@@
  #include <linux/nfs_mount.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
- #include <linux/aio.h>
  #include <linux/gfp.h>
  #include <linux/swap.h>
  
@@@ -178,7 -177,7 +177,7 @@@ nfs_file_read(struct kiocb *iocb, struc
                iocb->ki_filp,
                iov_iter_count(to), (unsigned long) iocb->ki_pos);
  
 -      result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 +      result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
        if (!result) {
                result = generic_file_read_iter(iocb, to);
                if (result > 0)
@@@ -199,7 -198,7 +198,7 @@@ nfs_file_splice_read(struct file *filp
        dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
                filp, (unsigned long) count, (unsigned long long) *ppos);
  
 -      res = nfs_revalidate_mapping(inode, filp->f_mapping);
 +      res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
        if (!res) {
                res = generic_file_splice_read(filp, ppos, pipe, count, flags);
                if (res > 0)
@@@ -372,10 -371,6 +371,10 @@@ start
                                 nfs_wait_bit_killable, TASK_KILLABLE);
        if (ret)
                return ret;
 +      /*
 +       * Wait for O_DIRECT to complete
 +       */
 +      nfs_inode_dio_wait(mapping->host);
  
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
@@@ -623,9 -618,6 +622,9 @@@ static int nfs_vm_page_mkwrite(struct v
        /* make sure the cache has finished storing the page */
        nfs_fscache_wait_on_page_write(NFS_I(inode), page);
  
 +      wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
 +                      nfs_wait_bit_killable, TASK_KILLABLE);
 +
        lock_page(page);
        mapping = page_file_mapping(page);
        if (mapping != inode->i_mapping)
diff --combined fs/ntfs/file.c
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
   *
 - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
 + * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
   *
   * This program/include file is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License as published
@@@ -28,7 -28,6 +28,6 @@@
  #include <linux/swap.h>
  #include <linux/uio.h>
  #include <linux/writeback.h>
- #include <linux/aio.h>
  
  #include <asm/page.h>
  #include <asm/uaccess.h>
@@@ -329,168 -328,62 +328,168 @@@ err_out
        return err;
  }
  
 -/**
 - * ntfs_fault_in_pages_readable -
 - *
 - * Fault a number of userspace pages into pagetables.
 - *
 - * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
 - * with more than two userspace pages as well as handling the single page case
 - * elegantly.
 - *
 - * If you find this difficult to understand, then think of the while loop being
 - * the following code, except that we do without the integer variable ret:
 - *
 - *    do {
 - *            ret = __get_user(c, uaddr);
 - *            uaddr += PAGE_SIZE;
 - *    } while (!ret && uaddr < end);
 - *
 - * Note, the final __get_user() may well run out-of-bounds of the user buffer,
 - * but _not_ out-of-bounds of the page the user buffer belongs to, and since
 - * this is only a read and not a write, and since it is still in the same page,
 - * it should not matter and this makes the code much simpler.
 - */
 -static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
 -              int bytes)
 +static ssize_t ntfs_prepare_file_for_write(struct file *file, loff_t *ppos,
 +              size_t *count)
  {
 -      const char __user *end;
 -      volatile char c;
 -
 -      /* Set @end to the first byte outside the last page we care about. */
 -      end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
 -
 -      while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
 -              ;
 -}
 -
 -/**
 - * ntfs_fault_in_pages_readable_iovec -
 - *
 - * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
 - */
 -static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
 -              size_t iov_ofs, int bytes)
 -{
 -      do {
 -              const char __user *buf;
 -              unsigned len;
 +      loff_t pos;
 +      s64 end, ll;
 +      ssize_t err;
 +      unsigned long flags;
 +      struct inode *vi = file_inode(file);
 +      ntfs_inode *base_ni, *ni = NTFS_I(vi);
 +      ntfs_volume *vol = ni->vol;
  
 -              buf = iov->iov_base + iov_ofs;
 -              len = iov->iov_len - iov_ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              ntfs_fault_in_pages_readable(buf, len);
 -              bytes -= len;
 -              iov++;
 -              iov_ofs = 0;
 -      } while (bytes);
 +      ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
 +                      "0x%llx, count 0x%lx.", vi->i_ino,
 +                      (unsigned)le32_to_cpu(ni->type),
 +                      (unsigned long long)*ppos, (unsigned long)*count);
 +      /* We can write back this queue in page reclaim. */
 +      current->backing_dev_info = inode_to_bdi(vi);
 +      err = generic_write_checks(file, ppos, count, S_ISBLK(vi->i_mode));
 +      if (unlikely(err))
 +              goto out;
 +      /*
 +       * All checks have passed.  Before we start doing any writing we want
 +       * to abort any totally illegal writes.
 +       */
 +      BUG_ON(NInoMstProtected(ni));
 +      BUG_ON(ni->type != AT_DATA);
 +      /* If file is encrypted, deny access, just like NT4. */
 +      if (NInoEncrypted(ni)) {
 +              /* Only $DATA attributes can be encrypted. */
 +              /*
 +               * Reminder for later: Encrypted files are _always_
 +               * non-resident so that the content can always be encrypted.
 +               */
 +              ntfs_debug("Denying write access to encrypted file.");
 +              err = -EACCES;
 +              goto out;
 +      }
 +      if (NInoCompressed(ni)) {
 +              /* Only unnamed $DATA attribute can be compressed. */
 +              BUG_ON(ni->name_len);
 +              /*
 +               * Reminder for later: If resident, the data is not actually
 +               * compressed.  Only on the switch to non-resident does
 +               * compression kick in.  This is in contrast to encrypted files
 +               * (see above).
 +               */
 +              ntfs_error(vi->i_sb, "Writing to compressed files is not "
 +                              "implemented yet.  Sorry.");
 +              err = -EOPNOTSUPP;
 +              goto out;
 +      }
 +      if (*count == 0)
 +              goto out;
 +      base_ni = ni;
 +      if (NInoAttr(ni))
 +              base_ni = ni->ext.base_ntfs_ino;
 +      err = file_remove_suid(file);
 +      if (unlikely(err))
 +              goto out;
 +      /*
 +       * Our ->update_time method always succeeds thus file_update_time()
 +       * cannot fail either so there is no need to check the return code.
 +       */
 +      file_update_time(file);
 +      pos = *ppos;
 +      /* The first byte after the last cluster being written to. */
 +      end = (pos + *count + vol->cluster_size_mask) &
 +                      ~(u64)vol->cluster_size_mask;
 +      /*
 +       * If the write goes beyond the allocated size, extend the allocation
 +       * to cover the whole of the write, rounded up to the nearest cluster.
 +       */
 +      read_lock_irqsave(&ni->size_lock, flags);
 +      ll = ni->allocated_size;
 +      read_unlock_irqrestore(&ni->size_lock, flags);
 +      if (end > ll) {
 +              /*
 +               * Extend the allocation without changing the data size.
 +               *
 +               * Note we ensure the allocation is big enough to at least
 +               * write some data but we do not require the allocation to be
 +               * complete, i.e. it may be partial.
 +               */
 +              ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
 +              if (likely(ll >= 0)) {
 +                      BUG_ON(pos >= ll);
 +                      /* If the extension was partial truncate the write. */
 +                      if (end > ll) {
 +                              ntfs_debug("Truncating write to inode 0x%lx, "
 +                                              "attribute type 0x%x, because "
 +                                              "the allocation was only "
 +                                              "partially extended.",
 +                                              vi->i_ino, (unsigned)
 +                                              le32_to_cpu(ni->type));
 +                              *count = ll - pos;
 +                      }
 +              } else {
 +                      err = ll;
 +                      read_lock_irqsave(&ni->size_lock, flags);
 +                      ll = ni->allocated_size;
 +                      read_unlock_irqrestore(&ni->size_lock, flags);
 +                      /* Perform a partial write if possible or fail. */
 +                      if (pos < ll) {
 +                              ntfs_debug("Truncating write to inode 0x%lx "
 +                                              "attribute type 0x%x, because "
 +                                              "extending the allocation "
 +                                              "failed (error %d).",
 +                                              vi->i_ino, (unsigned)
 +                                              le32_to_cpu(ni->type),
 +                                              (int)-err);
 +                              *count = ll - pos;
 +                      } else {
 +                              if (err != -ENOSPC)
 +                                      ntfs_error(vi->i_sb, "Cannot perform "
 +                                                      "write to inode "
 +                                                      "0x%lx, attribute "
 +                                                      "type 0x%x, because "
 +                                                      "extending the "
 +                                                      "allocation failed "
 +                                                      "(error %ld).",
 +                                                      vi->i_ino, (unsigned)
 +                                                      le32_to_cpu(ni->type),
 +                                                      (long)-err);
 +                              else
 +                                      ntfs_debug("Cannot perform write to "
 +                                                      "inode 0x%lx, "
 +                                                      "attribute type 0x%x, "
 +                                                      "because there is no "
 +                                                      "space left.",
 +                                                      vi->i_ino, (unsigned)
 +                                                      le32_to_cpu(ni->type));
 +                              goto out;
 +                      }
 +              }
 +      }
 +      /*
 +       * If the write starts beyond the initialized size, extend it up to the
 +       * beginning of the write and initialize all non-sparse space between
 +       * the old initialized size and the new one.  This automatically also
 +       * increments the vfs inode->i_size to keep it above or equal to the
 +       * initialized_size.
 +       */
 +      read_lock_irqsave(&ni->size_lock, flags);
 +      ll = ni->initialized_size;
 +      read_unlock_irqrestore(&ni->size_lock, flags);
 +      if (pos > ll) {
 +              /*
 +               * Wait for ongoing direct i/o to complete before proceeding.
 +               * New direct i/o cannot start as we hold i_mutex.
 +               */
 +              inode_dio_wait(vi);
 +              err = ntfs_attr_extend_initialized(ni, pos);
 +              if (unlikely(err < 0))
 +                      ntfs_error(vi->i_sb, "Cannot perform write to inode "
 +                                      "0x%lx, attribute type 0x%x, because "
 +                                      "extending the initialized size "
 +                                      "failed (error %d).", vi->i_ino,
 +                                      (unsigned)le32_to_cpu(ni->type),
 +                                      (int)-err);
 +      }
 +out:
 +      return err;
  }
  
  /**
@@@ -527,8 -420,8 +526,8 @@@ static inline int __ntfs_grab_cache_pag
                                        goto err_out;
                                }
                        }
 -                      err = add_to_page_cache_lru(*cached_page, mapping, index,
 -                                      GFP_KERNEL);
 +                      err = add_to_page_cache_lru(*cached_page, mapping,
 +                                      index, GFP_KERNEL);
                        if (unlikely(err)) {
                                if (err == -EEXIST)
                                        continue;
@@@ -1374,6 -1267,180 +1373,6 @@@ rl_not_mapped_enoent
        return err;
  }
  
 -/*
 - * Copy as much as we can into the pages and return the number of bytes which
 - * were successfully copied.  If a fault is encountered then clear the pages
 - * out to (ofs + bytes) and return the number of bytes which were copied.
 - */
 -static inline size_t ntfs_copy_from_user(struct page **pages,
 -              unsigned nr_pages, unsigned ofs, const char __user *buf,
 -              size_t bytes)
 -{
 -      struct page **last_page = pages + nr_pages;
 -      char *addr;
 -      size_t total = 0;
 -      unsigned len;
 -      int left;
 -
 -      do {
 -              len = PAGE_CACHE_SIZE - ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              addr = kmap_atomic(*pages);
 -              left = __copy_from_user_inatomic(addr + ofs, buf, len);
 -              kunmap_atomic(addr);
 -              if (unlikely(left)) {
 -                      /* Do it the slow way. */
 -                      addr = kmap(*pages);
 -                      left = __copy_from_user(addr + ofs, buf, len);
 -                      kunmap(*pages);
 -                      if (unlikely(left))
 -                              goto err_out;
 -              }
 -              total += len;
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              buf += len;
 -              ofs = 0;
 -      } while (++pages < last_page);
 -out:
 -      return total;
 -err_out:
 -      total += len - left;
 -      /* Zero the rest of the target like __copy_from_user(). */
 -      while (++pages < last_page) {
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              len = PAGE_CACHE_SIZE;
 -              if (len > bytes)
 -                      len = bytes;
 -              zero_user(*pages, 0, len);
 -      }
 -      goto out;
 -}
 -
 -static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
 -              const struct iovec *iov, size_t iov_ofs, size_t bytes)
 -{
 -      size_t total = 0;
 -
 -      while (1) {
 -              const char __user *buf = iov->iov_base + iov_ofs;
 -              unsigned len;
 -              size_t left;
 -
 -              len = iov->iov_len - iov_ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              left = __copy_from_user_inatomic(vaddr, buf, len);
 -              total += len;
 -              bytes -= len;
 -              vaddr += len;
 -              if (unlikely(left)) {
 -                      total -= left;
 -                      break;
 -              }
 -              if (!bytes)
 -                      break;
 -              iov++;
 -              iov_ofs = 0;
 -      }
 -      return total;
 -}
 -
 -static inline void ntfs_set_next_iovec(const struct iovec **iovp,
 -              size_t *iov_ofsp, size_t bytes)
 -{
 -      const struct iovec *iov = *iovp;
 -      size_t iov_ofs = *iov_ofsp;
 -
 -      while (bytes) {
 -              unsigned len;
 -
 -              len = iov->iov_len - iov_ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              bytes -= len;
 -              iov_ofs += len;
 -              if (iov->iov_len == iov_ofs) {
 -                      iov++;
 -                      iov_ofs = 0;
 -              }
 -      }
 -      *iovp = iov;
 -      *iov_ofsp = iov_ofs;
 -}
 -
 -/*
 - * This has the same side-effects and return value as ntfs_copy_from_user().
 - * The difference is that on a fault we need to memset the remainder of the
 - * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
 - * single-segment behaviour.
 - *
 - * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
 - * atomic and when not atomic.  This is ok because it calls
 - * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
 - * fact, the only difference between __copy_from_user_inatomic() and
 - * __copy_from_user() is that the latter calls might_sleep() and the former
 - * should not zero the tail of the buffer on error.  And on many architectures
 - * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
 - * makes no difference at all on those architectures.
 - */
 -static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 -              unsigned nr_pages, unsigned ofs, const struct iovec **iov,
 -              size_t *iov_ofs, size_t bytes)
 -{
 -      struct page **last_page = pages + nr_pages;
 -      char *addr;
 -      size_t copied, len, total = 0;
 -
 -      do {
 -              len = PAGE_CACHE_SIZE - ofs;
 -              if (len > bytes)
 -                      len = bytes;
 -              addr = kmap_atomic(*pages);
 -              copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
 -                              *iov, *iov_ofs, len);
 -              kunmap_atomic(addr);
 -              if (unlikely(copied != len)) {
 -                      /* Do it the slow way. */
 -                      addr = kmap(*pages);
 -                      copied = __ntfs_copy_from_user_iovec_inatomic(addr +
 -                                      ofs, *iov, *iov_ofs, len);
 -                      if (unlikely(copied != len))
 -                              goto err_out;
 -                      kunmap(*pages);
 -              }
 -              total += len;
 -              ntfs_set_next_iovec(iov, iov_ofs, len);
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              ofs = 0;
 -      } while (++pages < last_page);
 -out:
 -      return total;
 -err_out:
 -      BUG_ON(copied > len);
 -      /* Zero the rest of the target like __copy_from_user(). */
 -      memset(addr + ofs + copied, 0, len - copied);
 -      kunmap(*pages);
 -      total += copied;
 -      ntfs_set_next_iovec(iov, iov_ofs, copied);
 -      while (++pages < last_page) {
 -              bytes -= len;
 -              if (!bytes)
 -                      break;
 -              len = PAGE_CACHE_SIZE;
 -              if (len > bytes)
 -                      len = bytes;
 -              zero_user(*pages, 0, len);
 -      }
 -      goto out;
 -}
 -
  static inline void ntfs_flush_dcache_pages(struct page **pages,
                unsigned nr_pages)
  {
@@@ -1694,83 -1761,86 +1693,83 @@@ err_out
        return err;
  }
  
 -static void ntfs_write_failed(struct address_space *mapping, loff_t to)
 +/*
 + * Copy as much as we can into the pages and return the number of bytes which
 + * were successfully copied.  If a fault is encountered then clear the pages
 + * out to (ofs + bytes) and return the number of bytes which were copied.
 + */
 +static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
 +              unsigned ofs, struct iov_iter *i, size_t bytes)
  {
 -      struct inode *inode = mapping->host;
 +      struct page **last_page = pages + nr_pages;
 +      size_t total = 0;
 +      struct iov_iter data = *i;
 +      unsigned len, copied;
  
 -      if (to > inode->i_size) {
 -              truncate_pagecache(inode, inode->i_size);
 -              ntfs_truncate_vfs(inode);
 -      }
 +      do {
 +              len = PAGE_CACHE_SIZE - ofs;
 +              if (len > bytes)
 +                      len = bytes;
 +              copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
 +                              len);
 +              total += copied;
 +              bytes -= copied;
 +              if (!bytes)
 +                      break;
 +              iov_iter_advance(&data, copied);
 +              if (copied < len)
 +                      goto err;
 +              ofs = 0;
 +      } while (++pages < last_page);
 +out:
 +      return total;
 +err:
 +      /* Zero the rest of the target like __copy_from_user(). */
 +      len = PAGE_CACHE_SIZE - copied;
 +      do {
 +              if (len > bytes)
 +                      len = bytes;
 +              zero_user(*pages, copied, len);
 +              bytes -= len;
 +              copied = 0;
 +              len = PAGE_CACHE_SIZE;
 +      } while (++pages < last_page);
 +      goto out;
  }
  
  /**
 - * ntfs_file_buffered_write -
 - *
 - * Locking: The vfs is holding ->i_mutex on the inode.
 + * ntfs_perform_write - perform buffered write to a file
 + * @file:     file to write to
 + * @i:                iov_iter with data to write
 + * @pos:      byte offset in the file at which to begin writing
   */
 -static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 -              const struct iovec *iov, unsigned long nr_segs,
 -              loff_t pos, loff_t *ppos, size_t count)
 +static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
 +              loff_t pos)
  {
 -      struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *vi = mapping->host;
        ntfs_inode *ni = NTFS_I(vi);
        ntfs_volume *vol = ni->vol;
        struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
        struct page *cached_page = NULL;
 -      char __user *buf = NULL;
 -      s64 end, ll;
        VCN last_vcn;
        LCN lcn;
 -      unsigned long flags;
 -      size_t bytes, iov_ofs = 0;      /* Offset in the current iovec. */
 -      ssize_t status, written;
 +      size_t bytes;
 +      ssize_t status, written = 0;
        unsigned nr_pages;
 -      int err;
  
 -      ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
 -                      "pos 0x%llx, count 0x%lx.",
 -                      vi->i_ino, (unsigned)le32_to_cpu(ni->type),
 -                      (unsigned long long)pos, (unsigned long)count);
 -      if (unlikely(!count))
 -              return 0;
 -      BUG_ON(NInoMstProtected(ni));
 -      /*
 -       * If the attribute is not an index root and it is encrypted or
 -       * compressed, we cannot write to it yet.  Note we need to check for
 -       * AT_INDEX_ALLOCATION since this is the type of both directory and
 -       * index inodes.
 -       */
 -      if (ni->type != AT_INDEX_ALLOCATION) {
 -              /* If file is encrypted, deny access, just like NT4. */
 -              if (NInoEncrypted(ni)) {
 -                      /*
 -                       * Reminder for later: Encrypted files are _always_
 -                       * non-resident so that the content can always be
 -                       * encrypted.
 -                       */
 -                      ntfs_debug("Denying write access to encrypted file.");
 -                      return -EACCES;
 -              }
 -              if (NInoCompressed(ni)) {
 -                      /* Only unnamed $DATA attribute can be compressed. */
 -                      BUG_ON(ni->type != AT_DATA);
 -                      BUG_ON(ni->name_len);
 -                      /*
 -                       * Reminder for later: If resident, the data is not
 -                       * actually compressed.  Only on the switch to non-
 -                       * resident does compression kick in.  This is in
 -                       * contrast to encrypted files (see above).
 -                       */
 -                      ntfs_error(vi->i_sb, "Writing to compressed files is "
 -                                      "not implemented yet.  Sorry.");
 -                      return -EOPNOTSUPP;
 -              }
 -      }
 +      ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
 +                      "0x%llx, count 0x%lx.", vi->i_ino,
 +                      (unsigned)le32_to_cpu(ni->type),
 +                      (unsigned long long)pos,
 +                      (unsigned long)iov_iter_count(i));
        /*
         * If a previous ntfs_truncate() failed, repeat it and abort if it
         * fails again.
         */
        if (unlikely(NInoTruncateFailed(ni))) {
 +              int err;
 +
                inode_dio_wait(vi);
                err = ntfs_truncate(vi);
                if (err || NInoTruncateFailed(ni)) {
                        return err;
                }
        }
 -      /* The first byte after the write. */
 -      end = pos + count;
 -      /*
 -       * If the write goes beyond the allocated size, extend the allocation
 -       * to cover the whole of the write, rounded up to the nearest cluster.
 -       */
 -      read_lock_irqsave(&ni->size_lock, flags);
 -      ll = ni->allocated_size;
 -      read_unlock_irqrestore(&ni->size_lock, flags);
 -      if (end > ll) {
 -              /* Extend the allocation without changing the data size. */
 -              ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
 -              if (likely(ll >= 0)) {
 -                      BUG_ON(pos >= ll);
 -                      /* If the extension was partial truncate the write. */
 -                      if (end > ll) {
 -                              ntfs_debug("Truncating write to inode 0x%lx, "
 -                                              "attribute type 0x%x, because "
 -                                              "the allocation was only "
 -                                              "partially extended.",
 -                                              vi->i_ino, (unsigned)
 -                                              le32_to_cpu(ni->type));
 -                              end = ll;
 -                              count = ll - pos;
 -                      }
 -              } else {
 -                      err = ll;
 -                      read_lock_irqsave(&ni->size_lock, flags);
 -                      ll = ni->allocated_size;
 -                      read_unlock_irqrestore(&ni->size_lock, flags);
 -                      /* Perform a partial write if possible or fail. */
 -                      if (pos < ll) {
 -                              ntfs_debug("Truncating write to inode 0x%lx, "
 -                                              "attribute type 0x%x, because "
 -                                              "extending the allocation "
 -                                              "failed (error code %i).",
 -                                              vi->i_ino, (unsigned)
 -                                              le32_to_cpu(ni->type), err);
 -                              end = ll;
 -                              count = ll - pos;
 -                      } else {
 -                              ntfs_error(vol->sb, "Cannot perform write to "
 -                                              "inode 0x%lx, attribute type "
 -                                              "0x%x, because extending the "
 -                                              "allocation failed (error "
 -                                              "code %i).", vi->i_ino,
 -                                              (unsigned)
 -                                              le32_to_cpu(ni->type), err);
 -                              return err;
 -                      }
 -              }
 -      }
 -      written = 0;
 -      /*
 -       * If the write starts beyond the initialized size, extend it up to the
 -       * beginning of the write and initialize all non-sparse space between
 -       * the old initialized size and the new one.  This automatically also
 -       * increments the vfs inode->i_size to keep it above or equal to the
 -       * initialized_size.
 -       */
 -      read_lock_irqsave(&ni->size_lock, flags);
 -      ll = ni->initialized_size;
 -      read_unlock_irqrestore(&ni->size_lock, flags);
 -      if (pos > ll) {
 -              err = ntfs_attr_extend_initialized(ni, pos);
 -              if (err < 0) {
 -                      ntfs_error(vol->sb, "Cannot perform write to inode "
 -                                      "0x%lx, attribute type 0x%x, because "
 -                                      "extending the initialized size "
 -                                      "failed (error code %i).", vi->i_ino,
 -                                      (unsigned)le32_to_cpu(ni->type), err);
 -                      status = err;
 -                      goto err_out;
 -              }
 -      }
        /*
         * Determine the number of pages per cluster for non-resident
         * attributes.
        nr_pages = 1;
        if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
                nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
 -      /* Finally, perform the actual write. */
        last_vcn = -1;
 -      if (likely(nr_segs == 1))
 -              buf = iov->iov_base;
        do {
                VCN vcn;
                pgoff_t idx, start_idx;
                                                vol->cluster_size_bits, false);
                                up_read(&ni->runlist.lock);
                                if (unlikely(lcn < LCN_HOLE)) {
 -                                      status = -EIO;
                                        if (lcn == LCN_ENOMEM)
                                                status = -ENOMEM;
 -                                      else
 +                                      else {
 +                                              status = -EIO;
                                                ntfs_error(vol->sb, "Cannot "
                                                        "perform write to "
                                                        "inode 0x%lx, "
                                                        "is corrupt.",
                                                        vi->i_ino, (unsigned)
                                                        le32_to_cpu(ni->type));
 +                                      }
                                        break;
                                }
                                if (lcn == LCN_HOLE) {
                                }
                        }
                }
 -              if (bytes > count)
 -                      bytes = count;
 +              if (bytes > iov_iter_count(i))
 +                      bytes = iov_iter_count(i);
 +again:
                /*
                 * Bring in the user page(s) that we will copy from _first_.
                 * Otherwise there is a nasty deadlock on copying from the same
                 * pages being swapped out between us bringing them into memory
                 * and doing the actual copying.
                 */
 -              if (likely(nr_segs == 1))
 -                      ntfs_fault_in_pages_readable(buf, bytes);
 -              else
 -                      ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
 +              if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
 +                      status = -EFAULT;
 +                      break;
 +              }
                /* Get and lock @do_pages starting at index @start_idx. */
                status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
                                pages, &cached_page);
                        status = ntfs_prepare_pages_for_non_resident_write(
                                        pages, do_pages, pos, bytes);
                        if (unlikely(status)) {
 -                              loff_t i_size;
 -
                                do {
                                        unlock_page(pages[--do_pages]);
                                        page_cache_release(pages[do_pages]);
                                } while (do_pages);
 -                              /*
 -                               * The write preparation may have instantiated
 -                               * allocated space outside i_size.  Trim this
 -                               * off again.  We can ignore any errors in this
 -                               * case as we will just be waisting a bit of
 -                               * allocated space, which is not a disaster.
 -                               */
 -                              i_size = i_size_read(vi);
 -                              if (pos + bytes > i_size) {
 -                                      ntfs_write_failed(mapping, pos + bytes);
 -                              }
                                break;
                        }
                }
                u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
 -              if (likely(nr_segs == 1)) {
 -                      copied = ntfs_copy_from_user(pages + u, do_pages - u,
 -                                      ofs, buf, bytes);
 -                      buf += copied;
 -              } else
 -                      copied = ntfs_copy_from_user_iovec(pages + u,
 -                                      do_pages - u, ofs, &iov, &iov_ofs,
 -                                      bytes);
 +              copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
 +                                      i, bytes);
                ntfs_flush_dcache_pages(pages + u, do_pages - u);
 -              status = ntfs_commit_pages_after_write(pages, do_pages, pos,
 -                              bytes);
 -              if (likely(!status)) {
 -                      written += copied;
 -                      count -= copied;
 -                      pos += copied;
 -                      if (unlikely(copied != bytes))
 -                              status = -EFAULT;
 +              status = 0;
 +              if (likely(copied == bytes)) {
 +                      status = ntfs_commit_pages_after_write(pages, do_pages,
 +                                      pos, bytes);
 +                      if (!status)
 +                              status = bytes;
                }
                do {
                        unlock_page(pages[--do_pages]);
                        page_cache_release(pages[do_pages]);
                } while (do_pages);
 -              if (unlikely(status))
 +              if (unlikely(status < 0))
                        break;
 -              balance_dirty_pages_ratelimited(mapping);
 +              copied = status;
                cond_resched();
 -      } while (count);
 -err_out:
 -      *ppos = pos;
 +              if (unlikely(!copied)) {
 +                      size_t sc;
 +
 +                      /*
 +                       * We failed to copy anything.  Fall back to single
 +                       * segment length write.
 +                       *
 +                       * This is needed to avoid possible livelock in the
 +                       * case that all segments in the iov cannot be copied
 +                       * at once without a pagefault.
 +                       */
 +                      sc = iov_iter_single_seg_count(i);
 +                      if (bytes > sc)
 +                              bytes = sc;
 +                      goto again;
 +              }
 +              iov_iter_advance(i, copied);
 +              pos += copied;
 +              written += copied;
 +              balance_dirty_pages_ratelimited(mapping);
 +              if (fatal_signal_pending(current)) {
 +                      status = -EINTR;
 +                      break;
 +              }
 +      } while (iov_iter_count(i));
        if (cached_page)
                page_cache_release(cached_page);
        ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
  }
  
  /**
 - * ntfs_file_aio_write_nolock -
 + * ntfs_file_write_iter_nolock - write data to a file
 + * @iocb:     IO state structure (file, offset, etc.)
 + * @from:     iov_iter with data to write
 + *
 + * Basically the same as __generic_file_write_iter() except that it ends
 + * up calling ntfs_perform_write() instead of generic_perform_write() and that
 + * O_DIRECT is not implemented.
   */
 -static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 -              const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
 +static ssize_t ntfs_file_write_iter_nolock(struct kiocb *iocb,
 +              struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
 -      struct address_space *mapping = file->f_mapping;
 -      struct inode *inode = mapping->host;
 -      loff_t pos;
 -      size_t count;           /* after file limit checks */
 -      ssize_t written, err;
 +      loff_t pos = iocb->ki_pos;
 +      ssize_t written = 0;
 +      ssize_t err;
 +      size_t count = iov_iter_count(from);
  
 -      count = iov_length(iov, nr_segs);
 -      pos = *ppos;
 -      /* We can write back this queue in page reclaim. */
 -      current->backing_dev_info = inode_to_bdi(inode);
 -      written = 0;
 -      err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 -      if (err)
 -              goto out;
 -      if (!count)
 -              goto out;
 -      err = file_remove_suid(file);
 -      if (err)
 -              goto out;
 -      err = file_update_time(file);
 -      if (err)
 -              goto out;
 -      written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
 -                      count);
 -out:
 +      err = ntfs_prepare_file_for_write(file, &pos, &count);
 +      if (count && !err) {
 +              iov_iter_truncate(from, count);
 +              written = ntfs_perform_write(file, from, pos);
 +              if (likely(written >= 0))
 +                      iocb->ki_pos = pos + written;
 +      }
        current->backing_dev_info = NULL;
        return written ? written : err;
  }
  
  /**
 - * ntfs_file_aio_write -
 + * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
 + * @iocb:     IO state structure
 + * @from:     iov_iter with data to write
 + *
 + * Basically the same as generic_file_write_iter() except that it ends up
 + * calling ntfs_file_write_iter_nolock() instead of
 + * __generic_file_write_iter().
   */
 -static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 -              unsigned long nr_segs, loff_t pos)
 +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
        struct file *file = iocb->ki_filp;
 -      struct address_space *mapping = file->f_mapping;
 -      struct inode *inode = mapping->host;
 +      struct inode *vi = file_inode(file);
        ssize_t ret;
  
 -      BUG_ON(iocb->ki_pos != pos);
 -
 -      mutex_lock(&inode->i_mutex);
 -      ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 -      mutex_unlock(&inode->i_mutex);
 +      mutex_lock(&vi->i_mutex);
 +      ret = ntfs_file_write_iter_nolock(iocb, from);
 +      mutex_unlock(&vi->i_mutex);
        if (ret > 0) {
 -              int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 +              ssize_t err;
 +
 +              err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
@@@ -2048,17 -2196,37 +2047,17 @@@ static int ntfs_file_fsync(struct file 
  #endif /* NTFS_RW */
  
  const struct file_operations ntfs_file_ops = {
 -      .llseek         = generic_file_llseek,   /* Seek inside file. */
 -      .read           = new_sync_read,         /* Read from file. */
 -      .read_iter      = generic_file_read_iter, /* Async read from file. */
 +      .llseek         = generic_file_llseek,
 +      .read           = new_sync_read,
 +      .read_iter      = generic_file_read_iter,
  #ifdef NTFS_RW
 -      .write          = do_sync_write,         /* Write to file. */
 -      .aio_write      = ntfs_file_aio_write,   /* Async write to file. */
 -      /*.release      = ,*/                    /* Last file is closed.  See
 -                                                  fs/ext2/file.c::
 -                                                  ext2_release_file() for
 -                                                  how to use this to discard
 -                                                  preallocated space for
 -                                                  write opened files. */
 -      .fsync          = ntfs_file_fsync,       /* Sync a file to disk. */
 -      /*.aio_fsync    = ,*/                    /* Sync all outstanding async
 -                                                  i/o operations on a
 -                                                  kiocb. */
 +      .write          = new_sync_write,
 +      .write_iter     = ntfs_file_write_iter,
 +      .fsync          = ntfs_file_fsync,
  #endif /* NTFS_RW */
 -      /*.ioctl        = ,*/                    /* Perform function on the
 -                                                  mounted filesystem. */
 -      .mmap           = generic_file_mmap,     /* Mmap file. */
 -      .open           = ntfs_file_open,        /* Open file. */
 -      .splice_read    = generic_file_splice_read /* Zero-copy data send with
 -                                                  the data source being on
 -                                                  the ntfs partition.  We do
 -                                                  not need to care about the
 -                                                  data destination. */
 -      /*.sendpage     = ,*/                    /* Zero-copy data send with
 -                                                  the data destination being
 -                                                  on the ntfs partition.  We
 -                                                  do not need to care about
 -                                                  the data source. */
 +      .mmap           = generic_file_mmap,
 +      .open           = ntfs_file_open,
 +      .splice_read    = generic_file_splice_read,
  };
  
  const struct inode_operations ntfs_file_inode_ops = {
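
For readers following the conversion: the ntfs hunks above are an instance of a general pattern, a ->write_iter method that serialises on i_mutex, delegates to a *_nolock helper, and syncs the written range on success, plus a file_operations table that routes the legacy read/write entry points through new_sync_read()/new_sync_write(). A minimal sketch of that pattern, using hypothetical example_* names rather than the real ntfs functions:

#include <linux/fs.h>
#include <linux/uio.h>

/* Hypothetical helper standing in for the filesystem's real nolock writer. */
static ssize_t example_write_iter_nolock(struct kiocb *iocb, struct iov_iter *from);

static ssize_t example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	ret = example_write_iter_nolock(iocb, from);
	mutex_unlock(&inode->i_mutex);
	if (ret > 0) {
		/* Honour O_SYNC/O_DSYNC for the range just written. */
		ssize_t err = generic_write_sync(file, iocb->ki_pos - ret, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}

static const struct file_operations example_file_ops = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,	/* sync read driven through ->read_iter */
	.read_iter	= generic_file_read_iter,
	.write		= new_sync_write,	/* sync write driven through ->write_iter */
	.write_iter	= example_file_write_iter,
	.mmap		= generic_file_mmap,
};
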
diff --combined fs/xfs/xfs_file.c
@@@ -36,9 -36,7 +36,8 @@@
  #include "xfs_trace.h"
  #include "xfs_log.h"
  #include "xfs_icache.h"
 +#include "xfs_pnfs.h"
  
- #include <linux/aio.h>
  #include <linux/dcache.h>
  #include <linux/falloc.h>
  #include <linux/pagevec.h>
@@@ -397,8 -395,7 +396,8 @@@ STATIC int                         /* error (positive) *
  xfs_zero_last_block(
        struct xfs_inode        *ip,
        xfs_fsize_t             offset,
 -      xfs_fsize_t             isize)
 +      xfs_fsize_t             isize,
 +      bool                    *did_zeroing)
  {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
        zero_len = mp->m_sb.sb_blocksize - zero_offset;
        if (isize + zero_len > offset)
                zero_len = offset - isize;
 +      *did_zeroing = true;
        return xfs_iozero(ip, isize, zero_len);
  }
  
@@@ -445,8 -441,7 +444,8 @@@ int                                        /* error (positive) *
  xfs_zero_eof(
        struct xfs_inode        *ip,
        xfs_off_t               offset,         /* starting I/O offset */
 -      xfs_fsize_t             isize)          /* current inode size */
 +      xfs_fsize_t             isize,          /* current inode size */
 +      bool                    *did_zeroing)
  {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           start_zero_fsb;
         * We only zero a part of that block so it is handled specially.
         */
        if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
 -              error = xfs_zero_last_block(ip, offset, isize);
 +              error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
                if (error)
                        return error;
        }
                if (error)
                        return error;
  
 +              *did_zeroing = true;
                start_zero_fsb = imap.br_startoff + imap.br_blockcount;
                ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
        }
@@@ -559,10 -553,6 +558,10 @@@ restart
        if (error)
                return error;
  
 +      error = xfs_break_layouts(inode, iolock);
 +      if (error)
 +              return error;
 +
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
         * having to redo all checks before.
         */
        if (*pos > i_size_read(inode)) {
 +              bool    zero = false;
 +
                if (*iolock == XFS_IOLOCK_SHARED) {
                        xfs_rw_iunlock(ip, *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, *iolock);
                        goto restart;
                }
 -              error = xfs_zero_eof(ip, *pos, i_size_read(inode));
 +              error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
                if (error)
                        return error;
        }
@@@ -833,7 -821,6 +832,7 @@@ xfs_file_fallocate
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        enum xfs_prealloc_flags flags = 0;
 +      uint                    iolock = XFS_IOLOCK_EXCL;
        loff_t                  new_size = 0;
  
        if (!S_ISREG(inode->i_mode))
                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
  
 -      xfs_ilock(ip, XFS_IOLOCK_EXCL);
 +      xfs_ilock(ip, iolock);
 +      error = xfs_break_layouts(inode, &iolock);
 +      if (error)
 +              goto out_unlock;
 +
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
        }
  
  out_unlock:
 -      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 +      xfs_iunlock(ip, iolock);
        return error;
  }
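
The new did_zeroing out-parameter lets callers of xfs_zero_eof() distinguish "nothing needed zeroing" from "blocks were dirtied". A hypothetical caller fragment illustrating why that matters; the flush call and the newsize/oldsize names are illustrative, not taken from this patch:

	bool	did_zeroing = false;
	int	error;

	error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
	if (error)
		return error;

	/* Only pay for writing back the zeroed range if something was zeroed. */
	if (did_zeroing)
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						     oldsize, newsize - 1);
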
  
diff --combined include/linux/fs.h
@@@ -314,6 -314,28 +314,28 @@@ struct page
  struct address_space;
  struct writeback_control;
  
+ #define IOCB_EVENTFD          (1 << 0)
+ struct kiocb {
+       struct file             *ki_filp;
+       loff_t                  ki_pos;
+       void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+       void                    *private;
+       int                     ki_flags;
+ };
+ static inline bool is_sync_kiocb(struct kiocb *kiocb)
+ {
+       return kiocb->ki_complete == NULL;
+ }
+ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
+ {
+       *kiocb = (struct kiocb) {
+               .ki_filp = filp,
+       };
+ }
  /*
   * "descriptor" for what we're up to with a read.
   * This allows us to use the same read code yet
@@@ -968,6 -990,9 +990,6 @@@ struct file_lock_context 
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;
 -      int                     flc_flock_cnt;
 -      int                     flc_posix_cnt;
 -      int                     flc_lease_cnt;
  };
  
  /* The following constant reflects the upper bound of the file/locking space */
@@@ -2144,7 -2169,7 +2166,7 @@@ struct filename 
        const __user char       *uptr;  /* original userland pointer */
        struct audit_names      *aname;
        int                     refcnt;
 -      bool                    separate; /* should "name" be freed? */
 +      const char              iname[];
  };
  
  extern long vfs_truncate(struct path *, loff_t);
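
The trimmed-down kiocb above carries only what the synchronous and aio paths both need; a NULL ki_complete is what marks a kiocb as synchronous. A from-memory sketch, in the spirit of new_sync_read(), of how a sync read wrapper drives ->read_iter with such a kiocb (not a verbatim copy of the kernel function):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t example_sync_read(struct file *filp, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);	/* ki_complete == NULL => synchronous */
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);	/* sync kiocbs now complete inline */
	*ppos = kiocb.ki_pos;
	return ret;
}
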
diff --combined kernel/printk/printk.c
@@@ -32,7 -32,6 +32,6 @@@
  #include <linux/security.h>
  #include <linux/bootmem.h>
  #include <linux/memblock.h>
- #include <linux/aio.h>
  #include <linux/syscalls.h>
  #include <linux/kexec.h>
  #include <linux/kdb.h>
@@@ -46,6 -45,7 +45,7 @@@
  #include <linux/irq_work.h>
  #include <linux/utsname.h>
  #include <linux/ctype.h>
+ #include <linux/uio.h>
  
  #include <asm/uaccess.h>
  
@@@ -521,7 -521,7 +521,7 @@@ static ssize_t devkmsg_write(struct kio
        int i;
        int level = default_message_loglevel;
        int facility = 1;       /* LOG_USER */
-       size_t len = iocb->ki_nbytes;
+       size_t len = iov_iter_count(from);
        ssize_t ret = len;
  
        if (len > LOG_LINE_MAX)
@@@ -1811,7 -1811,7 +1811,7 @@@ int vprintk_default(const char *fmt, va
  
  #ifdef CONFIG_KGDB_KDB
        if (unlikely(kdb_trap_printk)) {
 -              r = vkdb_printf(fmt, args);
 +              r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
                return r;
        }
  #endif
@@@ -2464,7 -2464,6 +2464,7 @@@ void register_console(struct console *n
        for (i = 0, c = console_cmdline;
             i < MAX_CMDLINECONSOLES && c->name[0];
             i++, c++) {
 +              BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
                if (strcmp(c->name, newcon->name) != 0)
                        continue;
                if (newcon->index >= 0 &&
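
devkmsg_write() now takes its length from the iov_iter because the slimmed kiocb no longer records a byte count. A minimal hypothetical ->write_iter handler showing the same copy-in pattern (all names here are made up, not part of this patch):

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uio.h>

static ssize_t example_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	size_t len = iov_iter_count(from);	/* was iocb->ki_nbytes */
	char *buf;

	if (len > PAGE_SIZE)
		return -EINVAL;
	buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	if (copy_from_iter(buf, len, from) != len) {
		kfree(buf);
		return -EFAULT;
	}
	buf[len] = '\0';
	/* ... parse and consume buf ... */
	kfree(buf);
	return len;
}
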
diff --combined mm/shmem.c
@@@ -31,7 -31,7 +31,7 @@@
  #include <linux/mm.h>
  #include <linux/export.h>
  #include <linux/swap.h>
- #include <linux/aio.h>
+ #include <linux/uio.h>
  
  static struct vfsmount *shm_mnt;
  
@@@ -1455,9 -1455,6 +1455,9 @@@ static struct inode *shmem_get_inode(st
  
  bool shmem_mapping(struct address_space *mapping)
  {
 +      if (!mapping->host)
 +              return false;
 +
        return mapping->host->i_sb->s_op == &shmem_ops;
  }
  
@@@ -2322,8 -2319,8 +2322,8 @@@ static int shmem_rmdir(struct inode *di
  
  static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
  {
 -      bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
 -      bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
 +      bool old_is_dir = d_is_dir(old_dentry);
 +      bool new_is_dir = d_is_dir(new_dentry);
  
        if (old_dir != new_dir && old_is_dir != new_is_dir) {
                if (old_is_dir) {
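
The shmem hunks swap S_ISDIR(dentry->d_inode->i_mode) for d_is_dir(), which takes the type from flags cached in the dentry instead of dereferencing d_inode. A small before/after fragment (handle_directory() is a made-up placeholder):

	/* before: needs a positive dentry and an inode dereference */
	if (S_ISDIR(dentry->d_inode->i_mode))
		handle_directory(dentry);

	/* after: type comes from the cached dentry flags */
	if (d_is_dir(dentry))
		handle_directory(dentry);
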
diff --combined net/socket.c
@@@ -633,8 -633,7 +633,7 @@@ static int do_sock_sendmsg(struct socke
        init_sync_kiocb(&iocb, NULL);
        ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) :
                      __sock_sendmsg(&iocb, sock, msg, size);
-       if (-EIOCBQUEUED == ret)
-               ret = wait_on_sync_kiocb(&iocb);
+       BUG_ON(ret == -EIOCBQUEUED);
        return ret;
  }
  
@@@ -766,8 -765,7 +765,7 @@@ int sock_recvmsg(struct socket *sock, s
  
        init_sync_kiocb(&iocb, NULL);
        ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
-       if (-EIOCBQUEUED == ret)
-               ret = wait_on_sync_kiocb(&iocb);
+       BUG_ON(ret == -EIOCBQUEUED);
        return ret;
  }
  EXPORT_SYMBOL(sock_recvmsg);
@@@ -780,8 -778,7 +778,7 @@@ static int sock_recvmsg_nosec(struct so
  
        init_sync_kiocb(&iocb, NULL);
        ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
-       if (-EIOCBQUEUED == ret)
-               ret = wait_on_sync_kiocb(&iocb);
+       BUG_ON(ret == -EIOCBQUEUED);
        return ret;
  }
  
@@@ -858,11 -855,11 +855,11 @@@ static ssize_t sock_read_iter(struct ki
        if (iocb->ki_pos != 0)
                return -ESPIPE;
  
-       if (iocb->ki_nbytes == 0)       /* Match SYS5 behaviour */
+       if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                return 0;
  
        res = __sock_recvmsg(iocb, sock, &msg,
-                            iocb->ki_nbytes, msg.msg_flags);
+                            iov_iter_count(to), msg.msg_flags);
        *to = msg.msg_iter;
        return res;
  }
@@@ -883,7 -880,7 +880,7 @@@ static ssize_t sock_write_iter(struct k
        if (sock->type == SOCK_SEQPACKET)
                msg.msg_flags |= MSG_EOR;
  
-       res = __sock_sendmsg(iocb, sock, &msg, iocb->ki_nbytes);
+       res = __sock_sendmsg(iocb, sock, &msg, iov_iter_count(from));
        *from = msg.msg_iter;
        return res;
  }
@@@ -1702,8 -1699,6 +1699,8 @@@ SYSCALL_DEFINE6(sendto, int, fd, void _
  
        if (len > INT_MAX)
                len = INT_MAX;
 +      if (unlikely(!access_ok(VERIFY_READ, buff, len)))
 +              return -EFAULT;
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
@@@ -1762,8 -1757,6 +1759,8 @@@ SYSCALL_DEFINE6(recvfrom, int, fd, voi
  
        if (size > INT_MAX)
                size = INT_MAX;
 +      if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size)))
 +              return -EFAULT;
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
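
With ki_nbytes gone, the socket paths size each transfer from the iov_iter itself, and synchronous kiocbs can no longer return -EIOCBQUEUED, hence the BUG_ON()s above. Roughly the shape sock_read_iter() takes after the conversion, reconstructed from the hunk with non-essential details elided:

static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
	struct msghdr msg = { .msg_iter = *to };
	ssize_t res;

	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

	if (iocb->ki_pos != 0)
		return -ESPIPE;

	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
		return 0;

	res = __sock_recvmsg(iocb, sock, &msg, iov_iter_count(to), msg.msg_flags);
	*to = msg.msg_iter;		/* report what the protocol consumed */
	return res;
}
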
diff --combined sound/core/pcm_native.c
@@@ -25,7 -25,6 +25,6 @@@
  #include <linux/slab.h>
  #include <linux/time.h>
  #include <linux/pm_qos.h>
- #include <linux/aio.h>
  #include <linux/io.h>
  #include <linux/dma-mapping.h>
  #include <sound/core.h>
@@@ -35,6 -34,7 +34,7 @@@
  #include <sound/pcm_params.h>
  #include <sound/timer.h>
  #include <sound/minors.h>
+ #include <linux/uio.h>
  
  /*
   *  Compatibility
@@@ -1552,8 -1552,6 +1552,8 @@@ static int snd_pcm_do_drain_init(struc
                        if (! snd_pcm_playback_empty(substream)) {
                                snd_pcm_do_start(substream, SNDRV_PCM_STATE_DRAINING);
                                snd_pcm_post_start(substream, SNDRV_PCM_STATE_DRAINING);
 +                      } else {
 +                              runtime->status->state = SNDRV_PCM_STATE_SETUP;
                        }
                        break;
                case SNDRV_PCM_STATE_RUNNING: