Merge tag 'xfs-for-linus-4.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt

index 5a5a055..8146e9f 100644 (file)
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -236,10 +236,10 @@ Removed Mount Options
  
    Name                         Removed
    ----                         -------
-  delaylog/nodelaylog          v3.20
-  ihashsize                    v3.20
-  irixsgid                     v3.20
-  osyncisdsync/osyncisosync    v3.20
+  delaylog/nodelaylog          v4.0
+  ihashsize                    v4.0
+  irixsgid                     v4.0
+  osyncisdsync/osyncisosync    v4.0
  
  
  sysctls
@@ -346,5 +346,5 @@ Removed Sysctls
  
    Name                         Removed
    ----                         -------
-  fs.xfs.xfsbufd_centisec      v3.20
-  fs.xfs.age_buffer_centisecs  v3.20
+  fs.xfs.xfsbufd_centisec      v4.0
+  fs.xfs.age_buffer_centisecs  v4.0
diff --git a/fs/dax.c b/fs/dax.c

index 6f65f00..99b5fbc 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
   out:
         i_mmap_unlock_read(mapping);
  
-       if (bh->b_end_io)
-               bh->b_end_io(bh, 1);
-
         return error;
  }
  
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       get_block_t get_block, dax_iodone_t complete_unwritten)
  {
         struct file *file = vma->vm_file;
         struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 page_cache_release(page);
         }
  
+       /*
+        * If we successfully insert the new mapping over an unwritten extent,
+        * we need to ensure we convert the unwritten extent. If there is an
+        * error inserting the mapping, the filesystem needs to leave it as
+        * unwritten to prevent exposure of the stale underlying data to
+        * userspace, but we still need to call the completion function so
+        * the private resources on the mapping buffer can be released. We
+        * indicate what the callback should do via the uptodate variable, same
+        * as for normal BH based IO completions.
+        */
         error = dax_insert_mapping(inode, &bh, vma, vmf);
+       if (buffer_unwritten(&bh))
+               complete_unwritten(&bh, !error);
  
   out:
         if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
         }
         goto out;
  }
+EXPORT_SYMBOL(__dax_fault);
  
  /**
   * dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
   * fault handler for DAX files.
   */
  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
+             get_block_t get_block, dax_iodone_t complete_unwritten)
  {
         int result;
         struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 sb_start_pagefault(sb);
                 file_update_time(vma->vm_file);
         }
-       result = do_dax_fault(vma, vmf, get_block);
+       result = __dax_fault(vma, vmf, get_block, complete_unwritten);
         if (vmf->flags & FAULT_FLAG_WRITE)
                 sb_end_pagefault(sb);
  
diff --git a/fs/ext2/file.c b/fs/ext2/file.c

index 3a0a6c6..3b57c9f 100644 (file)
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,12 +28,12 @@
  #ifdef CONFIG_FS_DAX
  static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
-       return dax_fault(vma, vmf, ext2_get_block);
+       return dax_fault(vma, vmf, ext2_get_block, NULL);
  }
  
  static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
-       return dax_mkwrite(vma, vmf, ext2_get_block);
+       return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
  }
  
  static const struct vm_operations_struct ext2_dax_vm_ops = {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index ac517f1..bc313ac 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
  }
  
  #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+       struct inode *inode = bh->b_assoc_map->host;
+       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+       int err;
+       if (!uptodate)
+               return;
+       WARN_ON(!buffer_unwritten(bh));
+       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
-       return dax_fault(vma, vmf, ext4_get_block);
+       return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
                                         /* Is this the right get_block? */
  }
  
  static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
-       return dax_mkwrite(vma, vmf, ext4_get_block);
+       return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
  }
  
  static const struct vm_operations_struct ext4_dax_vm_ops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index f8a8d4e..41f8e55 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
         return retval;
  }
  
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-       struct inode *inode = bh->b_assoc_map->host;
-       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-       int err;
-       if (!uptodate)
-               return;
-       WARN_ON(!buffer_unwritten(bh));
-       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
  /* Maximum number of blocks we map for direct IO at once. */
  #define DIO_MAX_BLOCKS 4096
  
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
  
                 map_bh(bh, inode->i_sb, map.m_pblk);
                 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-               if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+               if (IS_DAX(inode) && buffer_unwritten(bh)) {
+                       /*
+                        * dgc: I suspect unwritten conversion on ext4+DAX is
+                        * fundamentally broken here when there are concurrent
+                        * read/write in progress on this inode.
+                        */
+                       WARN_ON_ONCE(io_end);
                         bh->b_assoc_map = inode->i_mapping;
                         bh->b_private = (void *)(unsigned long)iblock;
-                       bh->b_end_io = ext4_end_io_unwritten;
                 }
                 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                         set_buffer_defer_completion(bh);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c

index 516162b..f9e9ffe 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
  {
         xfs_agblock_t   bno;
         xfs_extlen_t    len;
+       xfs_extlen_t    diff;
  
         /* Trim busy sections out of found extent */
         xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
  
+       /*
+        * If we have a largish extent that happens to start before min_agbno,
+        * see if we can shift it into range...
+        */
+       if (bno < args->min_agbno && bno + len > args->min_agbno) {
+               diff = args->min_agbno - bno;
+               if (len > diff) {
+                       bno += diff;
+                       len -= diff;
+               }
+       }
+
         if (args->alignment > 1 && len >= args->minlen) {
                 xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
-               xfs_extlen_t    diff = aligned_bno - bno;
+
+               diff = aligned_bno - bno;
  
                 *resbno = aligned_bno;
                 *reslen = diff >= len ? 0 : len - diff;
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
                  * The good extent is closer than this one.
                  */
                 if (!dir) {
+                       if (*sbnoa > args->max_agbno)
+                               goto out_use_good;
                         if (*sbnoa >= args->agbno + gdiff)
                                 goto out_use_good;
                 } else {
+                       if (*sbnoa < args->min_agbno)
+                               goto out_use_good;
                         if (*sbnoa <= args->agbno - gdiff)
                                 goto out_use_good;
                 }
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
         dofirst = prandom_u32() & 1;
  #endif
  
+       /* handle uninitialized agbno range so caller doesn't have to */
+       if (!args->min_agbno && !args->max_agbno)
+               args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+       ASSERT(args->min_agbno <= args->max_agbno);
+
+       /* clamp agbno to the range if it's outside */
+       if (args->agbno < args->min_agbno)
+               args->agbno = args->min_agbno;
+       if (args->agbno > args->max_agbno)
+               args->agbno = args->max_agbno;
+
  restart:
         bno_cur_lt = NULL;
         bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
                                                   &ltbnoa, &ltlena);
                         if (ltlena < args->minlen)
                                 continue;
+                       if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+                               continue;
                         args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                         xfs_alloc_fix_len(args);
                         ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                         xfs_alloc_compute_aligned(args, ltbno, ltlen,
                                                   &ltbnoa, &ltlena);
-                       if (ltlena >= args->minlen)
+                       if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
                                 break;
                         if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
                                 goto error0;
-                       if (!i) {
+                       if (!i || ltbnoa < args->min_agbno) {
                                 xfs_btree_del_cursor(bno_cur_lt,
                                                      XFS_BTREE_NOERROR);
                                 bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                         xfs_alloc_compute_aligned(args, gtbno, gtlen,
                                                   &gtbnoa, &gtlena);
-                       if (gtlena >= args->minlen)
+                       if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
                                 break;
                         if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
                                 goto error0;
-                       if (!i) {
+                       if (!i || gtbnoa > args->max_agbno) {
                                 xfs_btree_del_cursor(bno_cur_gt,
                                                      XFS_BTREE_NOERROR);
                                 bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
         ASSERT(ltnew >= ltbno);
         ASSERT(ltnew + rlen <= ltbnoa + ltlena);
         ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+       ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
         args->agbno = ltnew;
  
         if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels(
  xfs_extlen_t
  xfs_alloc_longest_free_extent(
         struct xfs_mount        *mp,
-       struct xfs_perag        *pag)
+       struct xfs_perag        *pag,
+       xfs_extlen_t            need)
  {
-       xfs_extlen_t            need, delta = 0;
+       xfs_extlen_t            delta = 0;
  
-       need = XFS_MIN_FREELIST_PAG(pag, mp);
         if (need > pag->pagf_flcount)
                 delta = need - pag->pagf_flcount;
  
@@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent(
         return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
  }
  
+unsigned int
+xfs_alloc_min_freelist(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag)
+{
+       unsigned int            min_free;
+
+       /* space needed by the by-bno freespace btree */
+       min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+                                      mp->m_ag_maxlevels);
+       /* space needed by the by-size freespace btree */
+       min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+                                      mp->m_ag_maxlevels);
+
+       return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+       struct xfs_alloc_arg    *args,
+       xfs_extlen_t            min_free,
+       int                     flags)
+{
+       struct xfs_perag        *pag = args->pag;
+       xfs_extlen_t            longest;
+       int                     available;
+
+       if (flags & XFS_ALLOC_FLAG_FREEING)
+               return true;
+
+       /* do we have enough contiguous free space for the allocation? */
+       longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+       if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+               return false;
+
+       /* do we have enough free space remaining for the allocation? */
+       available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+                         min_free - args->total);
+       if (available < (int)args->minleft)
+               return false;
+
+       return true;
+}
+
  /*
   * Decide whether to use this allocation group for this allocation.
   * If so, fix up the btree freelist's size.
   */
  STATIC int                     /* error */
  xfs_alloc_fix_freelist(
-       xfs_alloc_arg_t *args,  /* allocation argument structure */
-       int             flags)  /* XFS_ALLOC_FLAG_... */
+       struct xfs_alloc_arg    *args,  /* allocation argument structure */
+       int                     flags)  /* XFS_ALLOC_FLAG_... */
  {
-       xfs_buf_t       *agbp;  /* agf buffer pointer */
-       xfs_agf_t       *agf;   /* a.g. freespace structure pointer */
-       xfs_buf_t       *agflbp;/* agfl buffer pointer */
-       xfs_agblock_t   bno;    /* freelist block */
-       xfs_extlen_t    delta;  /* new blocks needed in freelist */
-       int             error;  /* error result code */
-       xfs_extlen_t    longest;/* longest extent in allocation group */
-       xfs_mount_t     *mp;    /* file system mount point structure */
-       xfs_extlen_t    need;   /* total blocks needed in freelist */
-       xfs_perag_t     *pag;   /* per-ag information structure */
-       xfs_alloc_arg_t targs;  /* local allocation arguments */
-       xfs_trans_t     *tp;    /* transaction pointer */
-
-       mp = args->mp;
+       struct xfs_mount        *mp = args->mp;
+       struct xfs_perag        *pag = args->pag;
+       struct xfs_trans        *tp = args->tp;
+       struct xfs_buf          *agbp = NULL;
+       struct xfs_buf          *agflbp = NULL;
+       struct xfs_alloc_arg    targs;  /* local allocation arguments */
+       xfs_agblock_t           bno;    /* freelist block */
+       xfs_extlen_t            need;   /* total blocks needed in freelist */
+       int                     error;
  
-       pag = args->pag;
-       tp = args->tp;
         if (!pag->pagf_init) {
-               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-                               &agbp)))
-                       return error;
+               error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+               if (error)
+                       goto out_no_agbp;
                 if (!pag->pagf_init) {
                         ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
                         ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-                       args->agbp = NULL;
-                       return 0;
+                       goto out_agbp_relse;
                 }
-       } else
-               agbp = NULL;
+       }
  
         /*
-        * If this is a metadata preferred pag and we are user data
-        * then try somewhere else if we are not being asked to
-        * try harder at this point
+        * If this is a metadata preferred pag and we are user data then try
+        * somewhere else if we are not being asked to try harder at this
+        * point
          */
         if (pag->pagf_metadata && args->userdata &&
             (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
                 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-               args->agbp = NULL;
-               return 0;
+               goto out_agbp_relse;
         }
  
-       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               /*
-                * If it looks like there isn't a long enough extent, or enough
-                * total blocks, reject it.
-                */
-               need = XFS_MIN_FREELIST_PAG(pag, mp);
-               longest = xfs_alloc_longest_free_extent(mp, pag);
-               if ((args->minlen + args->alignment + args->minalignslop - 1) >
-                               longest ||
-                   ((int)(pag->pagf_freeblks + pag->pagf_flcount -
-                          need - args->total) < (int)args->minleft)) {
-                       if (agbp)
-                               xfs_trans_brelse(tp, agbp);
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
+       need = xfs_alloc_min_freelist(mp, pag);
+       if (!xfs_alloc_space_available(args, need, flags))
+               goto out_agbp_relse;
  
         /*
          * Get the a.g. freespace buffer.
          * Can fail if we're not blocking on locks, and it's held.
          */
-       if (agbp == NULL) {
-               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-                               &agbp)))
-                       return error;
-               if (agbp == NULL) {
+       if (!agbp) {
+               error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+               if (error)
+                       goto out_no_agbp;
+               if (!agbp) {
                         ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
                         ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
-       /*
-        * Figure out how many blocks we should have in the freelist.
-        */
-       agf = XFS_BUF_TO_AGF(agbp);
-       need = XFS_MIN_FREELIST(agf, mp);
-       /*
-        * If there isn't enough total or single-extent, reject it.
-        */
-       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               delta = need > be32_to_cpu(agf->agf_flcount) ?
-                       (need - be32_to_cpu(agf->agf_flcount)) : 0;
-               longest = be32_to_cpu(agf->agf_longest);
-               longest = (longest > delta) ? (longest - delta) :
-                       (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
-               if ((args->minlen + args->alignment + args->minalignslop - 1) >
-                               longest ||
-                   ((int)(be32_to_cpu(agf->agf_freeblks) +
-                    be32_to_cpu(agf->agf_flcount) - need - args->total) <
-                               (int)args->minleft)) {
-                       xfs_trans_brelse(tp, agbp);
-                       args->agbp = NULL;
-                       return 0;
+                       goto out_no_agbp;
                 }
         }
+
+       /* If there isn't enough total space or single-extent, reject it. */
+       need = xfs_alloc_min_freelist(mp, pag);
+       if (!xfs_alloc_space_available(args, need, flags))
+               goto out_agbp_relse;
+
         /*
          * Make the freelist shorter if it's too long.
+        *
+        * Note that from this point onwards, we will always release the agf and
+        * agfl buffers on error. This handles the case where we error out and
+        * the buffers are clean or may not have been joined to the transaction
+        * and hence need to be released manually. If they have been joined to
+        * the transaction, then xfs_trans_brelse() will handle them
+        * appropriately based on the recursion count and dirty state of the
+        * buffer.
+        *
+        * XXX (dgc): When we have lots of free space, does this buy us
+        * anything other than extra overhead when we need to put more blocks
+        * back on the free list? Maybe we should only do this when space is
+        * getting low or the AGFL is more than half full?
          */
-       while (be32_to_cpu(agf->agf_flcount) > need) {
-               xfs_buf_t       *bp;
+       while (pag->pagf_flcount > need) {
+               struct xfs_buf  *bp;
  
                 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
                 if (error)
-                       return error;
-               if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
-                       return error;
+                       goto out_agbp_relse;
+               error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+               if (error)
+                       goto out_agbp_relse;
                 bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
                 xfs_trans_binval(tp, bp);
         }
-       /*
-        * Initialize the args structure.
-        */
+
         memset(&targs, 0, sizeof(targs));
         targs.tp = tp;
         targs.mp = mp;
@@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist(
         targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
         targs.type = XFS_ALLOCTYPE_THIS_AG;
         targs.pag = pag;
-       if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
-               return error;
-       /*
-        * Make the freelist longer if it's too short.
-        */
-       while (be32_to_cpu(agf->agf_flcount) < need) {
+       error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+       if (error)
+               goto out_agbp_relse;
+
+       /* Make the freelist longer if it's too short. */
+       while (pag->pagf_flcount < need) {
                 targs.agbno = 0;
-               targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
-               /*
-                * Allocate as many blocks as possible at once.
-                */
-               if ((error = xfs_alloc_ag_vextent(&targs))) {
-                       xfs_trans_brelse(tp, agflbp);
-                       return error;
-               }
+               targs.maxlen = need - pag->pagf_flcount;
+
+               /* Allocate as many blocks as possible at once. */
+               error = xfs_alloc_ag_vextent(&targs);
+               if (error)
+                       goto out_agflbp_relse;
+
                 /*
                  * Stop if we run out.  Won't happen if callers are obeying
                  * the restrictions correctly.  Can happen for free calls
@@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist(
                 if (targs.agbno == NULLAGBLOCK) {
                         if (flags & XFS_ALLOC_FLAG_FREEING)
                                 break;
-                       xfs_trans_brelse(tp, agflbp);
-                       args->agbp = NULL;
-                       return 0;
+                       goto out_agflbp_relse;
                 }
                 /*
                  * Put each allocated block on the list.
@@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist(
                         error = xfs_alloc_put_freelist(tp, agbp,
                                                         agflbp, bno, 0);
                         if (error)
-                               return error;
+                               goto out_agflbp_relse;
                 }
         }
         xfs_trans_brelse(tp, agflbp);
         args->agbp = agbp;
         return 0;
+
+out_agflbp_relse:
+       xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+       if (agbp)
+               xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+       args->agbp = NULL;
+       return error;
  }
  
  /*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h

index d1b4b6a..ca1c816 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
         xfs_extlen_t    total;          /* total blocks needed in xaction */
         xfs_extlen_t    alignment;      /* align answer to multiple of this */
         xfs_extlen_t    minalignslop;   /* slop for minlen+alignment calcs */
+       xfs_agblock_t   min_agbno;      /* set an agbno range for NEAR allocs */
+       xfs_agblock_t   max_agbno;      /* ... */
         xfs_extlen_t    len;            /* output: actual size of extent */
         xfs_alloctype_t type;           /* allocation type XFS_ALLOCTYPE_... */
         xfs_alloctype_t otype;          /* original allocation type */
@@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg {
  #define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
  #define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
  
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+               struct xfs_perag *pag, xfs_extlen_t need);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
                 struct xfs_perag *pag);
  
  /*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c

index 0a472fb..3349c9a 100644 (file)
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -266,7 +266,7 @@ xfs_attr_set(
         tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
         error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
         if (error) {
-               xfs_trans_cancel(args.trans, 0);
+               xfs_trans_cancel(args.trans);
                 return error;
         }
         xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +276,7 @@ xfs_attr_set(
                                        XFS_QMOPT_RES_REGBLKS);
         if (error) {
                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_trans_cancel(args.trans);
                 return error;
         }
  
@@ -320,8 +320,7 @@ xfs_attr_set(
                                 xfs_trans_ichgtime(args.trans, dp,
                                                         XFS_ICHGTIME_CHG);
                         }
-                       err2 = xfs_trans_commit(args.trans,
-                                                XFS_TRANS_RELEASE_LOG_RES);
+                       err2 = xfs_trans_commit(args.trans);
                         xfs_iunlock(dp, XFS_ILOCK_EXCL);
  
                         return error ? error : err2;
@@ -383,16 +382,14 @@ xfs_attr_set(
          * Commit the last in the sequence of transactions.
          */
         xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(args.trans);
         xfs_iunlock(dp, XFS_ILOCK_EXCL);
  
         return error;
  
  out:
-       if (args.trans) {
-               xfs_trans_cancel(args.trans,
-                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       }
+       if (args.trans)
+               xfs_trans_cancel(args.trans);
         xfs_iunlock(dp, XFS_ILOCK_EXCL);
         return error;
  }
@@ -462,7 +459,7 @@ xfs_attr_remove(
         error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
                                   XFS_ATTRRM_SPACE_RES(mp), 0);
         if (error) {
-               xfs_trans_cancel(args.trans, 0);
+               xfs_trans_cancel(args.trans);
                 return error;
         }
  
@@ -501,16 +498,14 @@ xfs_attr_remove(
          * Commit the last in the sequence of transactions.
          */
         xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(args.trans);
         xfs_iunlock(dp, XFS_ILOCK_EXCL);
  
         return error;
  
  out:
-       if (args.trans) {
-               xfs_trans_cancel(args.trans,
-                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       }
+       if (args.trans)
+               xfs_trans_cancel(args.trans);
         xfs_iunlock(dp, XFS_ILOCK_EXCL);
         return error;
  }
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index f1026e8..63e05b6 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork(
         int                     committed;      /* xaction was committed */
         int                     logflags;       /* logging flags */
         int                     error;          /* error return value */
-       int                     cancel_flags = 0;
  
         ASSERT(XFS_IFORK_Q(ip) == 0);
  
@@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork(
                 tp->t_flags |= XFS_TRANS_RESERVE;
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                         XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                         XFS_QMOPT_RES_REGBLKS);
         if (error)
                 goto trans_cancel;
-       cancel_flags |= XFS_TRANS_ABORT;
         if (XFS_IFORK_Q(ip))
                 goto trans_cancel;
         if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork(
         error = xfs_bmap_finish(&tp, &flist, &committed);
         if (error)
                 goto bmap_cancel;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
  
  bmap_cancel:
         xfs_bmap_cancel(&flist);
  trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
  }
@@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent(
                 }
         }
  
-       longest = xfs_alloc_longest_free_extent(mp, pag);
+       longest = xfs_alloc_longest_free_extent(mp, pag,
+                                       xfs_alloc_min_freelist(mp, pag));
         if (*blen < longest)
                 *blen = longest;
  
@@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten(
         error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
                         &bma->cur, mval, bma->firstblock, bma->flist,
                         &tmp_logflags);
-       bma->logflags |= tmp_logflags;
+       /*
+        * Log the inode core unconditionally in the unwritten extent conversion
+        * path because the conversion might not have done so (e.g., if the
+        * extent count hasn't changed). We need to make sure the inode is dirty
+        * in the transaction for the sake of fsync(), even if nothing has
+        * changed, because fsync() will not force the log for this transaction
+        * unless it sees the inode pinned.
+        */
+       bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
         if (error)
                 return error;
  
@@ -5918,7 +5924,7 @@ xfs_bmap_split_extent(
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                         XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -5936,10 +5942,9 @@ xfs_bmap_split_extent(
         if (error)
                 goto out;
  
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
+       return xfs_trans_commit(tp);
  
  out:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         return error;
  }
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h

index 4daaa66..a0ae572 100644 (file)
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
         __uint32_t      sb_features_log_incompat;
  
         __uint32_t      sb_crc;         /* superblock crc */
-       __uint32_t      sb_pad;
+       xfs_extlen_t    sb_spino_align; /* sparse inode chunk alignment */
  
         xfs_ino_t       sb_pquotino;    /* project quota inode */
         xfs_lsn_t       sb_lsn;         /* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
         __be32          sb_features_log_incompat;
  
         __le32          sb_crc;         /* superblock crc */
-       __be32          sb_pad;
+       __be32          sb_spino_align; /* sparse inode chunk alignment */
  
         __be64          sb_pquotino;    /* project quota inode */
         __be64          sb_lsn;         /* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
  }
  
  #define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES  (1 << 1)        /* sparse inode chunks */
  #define XFS_SB_FEAT_INCOMPAT_ALL \
-               (XFS_SB_FEAT_INCOMPAT_FTYPE)
+               (XFS_SB_FEAT_INCOMPAT_FTYPE|    \
+                XFS_SB_FEAT_INCOMPAT_SPINODES)
  
  #define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
  static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
                 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
  }
  
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+               xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
  /*
   * end of superblock version macros
   */
@@ -758,19 +766,6 @@ typedef struct xfs_agfl {
  
  #define XFS_AGFL_CRC_OFF       offsetof(struct xfs_agfl, agfl_crc)
  
-
-#define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
-#define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
-       (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define        XFS_MIN_FREELIST(a,mp)          \
-       (XFS_MIN_FREELIST_RAW(          \
-               be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
-               be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define        XFS_MIN_FREELIST_PAG(pag,mp)    \
-       (XFS_MIN_FREELIST_RAW(          \
-               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
  #define XFS_AGB_TO_FSB(mp,agno,agbno)  \
         (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
  #define        XFS_FSB_TO_AGNO(mp,fsbno)       \
@@ -1216,26 +1211,54 @@ typedef __uint64_t      xfs_inofree_t;
  #define        XFS_INOBT_ALL_FREE              ((xfs_inofree_t)-1)
  #define        XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
  
+#define XFS_INOBT_HOLEMASK_FULL                0       /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS                (NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT    \
+       (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
  static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
  {
         return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
  }
  
  /*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes with the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
   */
  typedef struct xfs_inobt_rec {
         __be32          ir_startino;    /* starting inode number */
-       __be32          ir_freecount;   /* count of free inodes (set bits) */
+       union {
+               struct {
+                       __be32  ir_freecount;   /* count of free inodes */
+               } f;
+               struct {
+                       __be16  ir_holemask;/* hole mask for sparse chunks */
+                       __u8    ir_count;       /* total inode count */
+                       __u8    ir_freecount;   /* count of free inodes */
+               } sp;
+       } ir_u;
         __be64          ir_free;        /* free inode mask */
  } xfs_inobt_rec_t;
  
  typedef struct xfs_inobt_rec_incore {
         xfs_agino_t     ir_startino;    /* starting inode number */
-       __int32_t       ir_freecount;   /* count of free inodes (set bits) */
+       __uint16_t      ir_holemask;    /* hole mask for sparse chunks */
+       __uint8_t       ir_count;       /* total inode count */
+       __uint8_t       ir_freecount;   /* count of free inodes (set bits) */
         xfs_inofree_t   ir_free;        /* free inode mask */
  } xfs_inobt_rec_incore_t;
  
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+       /* non-zero holemask represents a sparse rec. */
+       return holemask;
+}
  
  /*
   * Key structure
@@ -1453,8 +1476,8 @@ struct xfs_acl {
                 sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
  
  /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE           (unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT                (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE           "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT                "SGI_ACL_DEFAULT"
  #define SGI_ACL_FILE_SIZE      (sizeof(SGI_ACL_FILE)-1)
  #define SGI_ACL_DEFAULT_SIZE   (sizeof(SGI_ACL_DEFAULT)-1)
  
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h

index 18dc721..89689c6 100644 (file)
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
  #define XFS_FSOP_GEOM_FLAGS_V5SB       0x8000  /* version 5 superblock */
  #define XFS_FSOP_GEOM_FLAGS_FTYPE      0x10000 /* inode directory types */
  #define XFS_FSOP_GEOM_FLAGS_FINOBT     0x20000 /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES   0x40000 /* sparse inode chunks  */
  
  /*
   * Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c

index 1c9e755..66efc70 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
         int                     *stat)  /* success/failure */
  {
         cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_holemask = 0;
+       cur->bc_rec.i.ir_count = 0;
         cur->bc_rec.i.ir_freecount = 0;
         cur->bc_rec.i.ir_free = 0;
         return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
         union xfs_btree_rec     rec;
  
         rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
-       rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+       if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+               rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+               rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+               rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+       } else {
+               /* ir_holemask/ir_count not supported on-disk */
+               rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+       }
         rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
         return xfs_btree_update(cur, &rec);
  }
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
         int                     error;
  
         error = xfs_btree_get_rec(cur, &rec, stat);
-       if (!error && *stat == 1) {
-               irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
-               irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
-               irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+       if (error || *stat == 0)
+               return error;
+
+       irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+       if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+               irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+               irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+               irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+       } else {
+               /*
+                * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+                * values for full inode chunks.
+                */
+               irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+               irec->ir_count = XFS_INODES_PER_CHUNK;
+               irec->ir_freecount =
+                               be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
         }
-       return error;
+       irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+       return 0;
  }
  
  /*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
  STATIC int
  xfs_inobt_insert_rec(
         struct xfs_btree_cur    *cur,
+       __uint16_t              holemask,
+       __uint8_t               count,
         __int32_t               freecount,
         xfs_inofree_t           free,
         int                     *stat)
  {
+       cur->bc_rec.i.ir_holemask = holemask;
+       cur->bc_rec.i.ir_count = count;
         cur->bc_rec.i.ir_freecount = freecount;
         cur->bc_rec.i.ir_free = free;
         return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
                 }
                 ASSERT(i == 0);
  
-               error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+               error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+                                            XFS_INODES_PER_CHUNK,
+                                            XFS_INODES_PER_CHUNK,
                                              XFS_INOBT_ALL_FREE, &i);
                 if (error) {
                         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
         struct xfs_mount        *mp,
         struct xfs_trans        *tp,
         struct list_head        *buffer_list,
+       int                     icount,
         xfs_agnumber_t          agno,
         xfs_agblock_t           agbno,
         xfs_agblock_t           length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
                  * they track in the AIL as if they were physically logged.
                  */
                 if (tp)
-                       xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+                       xfs_icreate_log(tp, agno, agbno, icount,
                                         mp->m_sb.sb_inodesize, length, gen);
         } else
                 version = 2;
@@ -346,6 +377,214 @@ xfs_ialloc_inode_init(
         return 0;
  }
  
+/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+       struct xfs_mount                *mp,
+       xfs_agino_t                     *startino,
+       uint16_t                        *allocmask)
+{
+       xfs_agblock_t                   agbno;
+       xfs_agblock_t                   mod;
+       int                             offset;
+
+       agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+       mod = agbno % mp->m_sb.sb_inoalignmt;
+       if (!mod)
+               return;
+
+       /* calculate the inode offset and align startino */
+       offset = mod << mp->m_sb.sb_inopblog;
+       *startino -= offset;
+
+       /*
+        * Since startino has been aligned down, left shift allocmask such that
+        * it continues to represent the same physical inodes relative to the
+        * new startino.
+        */
+       *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+       struct xfs_inobt_rec_incore     *trec,  /* tgt record */
+       struct xfs_inobt_rec_incore     *srec)  /* src record */
+{
+       uint64_t                        talloc;
+       uint64_t                        salloc;
+
+       /* records must cover the same inode range */
+       if (trec->ir_startino != srec->ir_startino)
+               return false;
+
+       /* both records must be sparse */
+       if (!xfs_inobt_issparse(trec->ir_holemask) ||
+           !xfs_inobt_issparse(srec->ir_holemask))
+               return false;
+
+       /* both records must track some inodes */
+       if (!trec->ir_count || !srec->ir_count)
+               return false;
+
+       /* can't exceed capacity of a full record */
+       if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+               return false;
+
+       /* verify there is no allocation overlap */
+       talloc = xfs_inobt_irec_to_allocmask(trec);
+       salloc = xfs_inobt_irec_to_allocmask(srec);
+       if (talloc & salloc)
+               return false;
+
+       return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+       struct xfs_inobt_rec_incore     *trec,  /* target */
+       struct xfs_inobt_rec_incore     *srec)  /* src */
+{
+       ASSERT(trec->ir_startino == srec->ir_startino);
+
+       /* combine the counts */
+       trec->ir_count += srec->ir_count;
+       trec->ir_freecount += srec->ir_freecount;
+
+       /*
+        * Merge the holemask and free mask. For both fields, 0 bits refer to
+        * allocated inodes. We combine the allocated ranges with bitwise AND.
+        */
+       trec->ir_holemask &= srec->ir_holemask;
+       trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+       struct xfs_mount                *mp,
+       struct xfs_trans                *tp,
+       struct xfs_buf                  *agbp,
+       int                             btnum,
+       struct xfs_inobt_rec_incore     *nrec,  /* in/out: new/merged rec. */
+       bool                            merge)  /* merge or replace */
+{
+       struct xfs_btree_cur            *cur;
+       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+       int                             error;
+       int                             i;
+       struct xfs_inobt_rec_incore     rec;
+
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+       /* the new record is pre-aligned so we know where to look */
+       error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+       if (error)
+               goto error;
+       /* if nothing there, insert a new record and return */
+       if (i == 0) {
+               error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+                                            nrec->ir_count, nrec->ir_freecount,
+                                            nrec->ir_free, &i);
+               if (error)
+                       goto error;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+               goto out;
+       }
+
+       /*
+        * A record exists at this startino. Merge or replace the record
+        * depending on what we've been asked to do.
+        */
+       if (merge) {
+               error = xfs_inobt_get_rec(cur, &rec, &i);
+               if (error)
+                       goto error;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+               XFS_WANT_CORRUPTED_GOTO(mp,
+                                       rec.ir_startino == nrec->ir_startino,
+                                       error);
+
+               /*
+                * This should never fail. If we have coexisting records that
+                * cannot merge, something is seriously wrong.
+                */
+               XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+                                       error);
+
+               trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+                                        rec.ir_holemask, nrec->ir_startino,
+                                        nrec->ir_holemask);
+
+               /* merge to nrec to output the updated record */
+               __xfs_inobt_rec_merge(nrec, &rec);
+
+               trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+                                         nrec->ir_holemask);
+
+               error = xfs_inobt_rec_check_count(mp, nrec);
+               if (error)
+                       goto error;
+       }
+
+       error = xfs_inobt_update(cur, nrec);
+       if (error)
+               goto error;
+
+out:
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       return 0;
+error:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       return error;
+}
+
  /*
   * Allocate new inodes in the allocation group specified by agbp.
   * Return 0 for success, else error code.
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
         xfs_agino_t     newlen;         /* new number of inodes */
         int             isaligned = 0;  /* inode allocation at stripe unit */
                                         /* boundary */
+       uint16_t        allocmask = (uint16_t) -1; /* init. to full chunk */
+       struct xfs_inobt_rec_incore rec;
         struct xfs_perag *pag;
+       int             do_sparse = 0;
  
         memset(&args, 0, sizeof(args));
         args.tp = tp;
         args.mp = tp->t_mountp;
+       args.fsbno = NULLFSBLOCK;
+
+#ifdef DEBUG
+       /* randomly do sparse inode allocations */
+       if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+           args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+               do_sparse = prandom_u32() & 1;
+#endif
  
         /*
          * Locking will ensure that we don't have two callers in here
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
         agno = be32_to_cpu(agi->agi_seqno);
         args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
                      args.mp->m_ialloc_blks;
+       if (do_sparse)
+               goto sparse_alloc;
         if (likely(newino != NULLAGINO &&
                   (args.agbno < be32_to_cpu(agi->agi_length)))) {
                 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
                  * subsequent requests.
                  */
                 args.minalignslop = 0;
-       } else
-               args.fsbno = NULLFSBLOCK;
+       }
  
         if (unlikely(args.fsbno == NULLFSBLOCK)) {
                 /*
@@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc(
                         return error;
         }
  
+       /*
+        * Finally, try a sparse allocation if the filesystem supports it and
+        * the sparse allocation length is smaller than a full chunk.
+        */
+       if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+           args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+           args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+               args.agbno = be32_to_cpu(agi->agi_root);
+               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+               args.alignment = args.mp->m_sb.sb_spino_align;
+               args.prod = 1;
+
+               args.minlen = args.mp->m_ialloc_min_blks;
+               args.maxlen = args.minlen;
+
+               /*
+                * The inode record will be aligned to full chunk size. We must
+                * prevent sparse allocation from AG boundaries that result in
+                * invalid inode records, such as records that start at agbno 0
+                * or extend beyond the AG.
+                *
+                * Set min agbno to the first aligned, non-zero agbno and max to
+                * the last aligned agbno that is at least one full chunk from
+                * the end of the AG.
+                */
+               args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+               args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+                                           args.mp->m_sb.sb_inoalignmt) -
+                                args.mp->m_ialloc_blks;
+
+               error = xfs_alloc_vextent(&args);
+               if (error)
+                       return error;
+
+               newlen = args.len << args.mp->m_sb.sb_inopblog;
+               ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+               allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+       }
+
         if (args.fsbno == NULLFSBLOCK) {
                 *alloc = 0;
                 return 0;
@@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc(
          * rather than a linear progression to prevent the next generation
          * number from being easily guessable.
          */
-       error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
-                       args.len, prandom_u32());
+       error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+                       args.agbno, args.len, prandom_u32());
  
         if (error)
                 return error;
@@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc(
          * Convert the results.
          */
         newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+       if (xfs_inobt_issparse(~allocmask)) {
+               /*
+                * We've allocated a sparse chunk. Align the startino and mask.
+                */
+               xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+               rec.ir_startino = newino;
+               rec.ir_holemask = ~allocmask;
+               rec.ir_count = newlen;
+               rec.ir_freecount = newlen;
+               rec.ir_free = XFS_INOBT_ALL_FREE;
+
+               /*
+                * Insert the sparse record into the inobt and allow for a merge
+                * if necessary. If a merge does occur, rec is updated to the
+                * merged record.
+                */
+               error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+                                              &rec, true);
+               if (error == -EFSCORRUPTED) {
+                       xfs_alert(args.mp,
+       "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+                                 XFS_AGINO_TO_INO(args.mp, agno,
+                                                  rec.ir_startino),
+                                 rec.ir_holemask, rec.ir_count);
+                       xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+               }
+               if (error)
+                       return error;
+
+               /*
+                * Unlike the inobt, we can't merge the part we've just allocated
+                * into the finobt due to finobt semantics. The original record
+                * may or may not exist independent of whether physical inodes
+                * exist in this sparse chunk.
+                *
+                * We must update the finobt record based on the inobt record.
+                * rec contains the fully merged and up to date inobt record
+                * from the previous call. Set merge false to replace any
+                * existing record with this one.
+                */
+               if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+                       error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+                                                      XFS_BTNUM_FINO, &rec,
+                                                      false);
+                       if (error)
+                               return error;
+               }
+       } else {
+               /* full chunk - insert new records to both btrees */
+               error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+                                        XFS_BTNUM_INO);
+               if (error)
+                       return error;
+
+               if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+                       error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+                                                newlen, XFS_BTNUM_FINO);
+                       if (error)
+                               return error;
+               }
+       }
+
+       /*
+        * Update AGI counts and newino.
+        */
         be32_add_cpu(&agi->agi_count, newlen);
         be32_add_cpu(&agi->agi_freecount, newlen);
         pag = xfs_perag_get(args.mp, agno);
@@ -511,20 +870,6 @@ xfs_ialloc_ag_alloc(
         xfs_perag_put(pag);
         agi->agi_newino = cpu_to_be32(newino);
  
-       /*
-        * Insert records describing the new inode chunk into the btrees.
-        */
-       error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-                                XFS_BTNUM_INO);
-       if (error)
-               return error;
-
-       if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-               error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-                                        XFS_BTNUM_FINO);
-               if (error)
-                       return error;
-       }
         /*
          * Log allocation group header fields
          */
@@ -645,7 +990,7 @@ xfs_ialloc_ag_select(
                  * if we fail allocation due to alignment issues then it is most
                  * likely a real ENOSPC condition.
                  */
-               ineed = mp->m_ialloc_blks;
+               ineed = mp->m_ialloc_min_blks;
                 if (flags && ineed > 1)
                         ineed += xfs_ialloc_cluster_alignment(mp);
                 longest = pag->pagf_longest;
@@ -731,6 +1076,27 @@ xfs_ialloc_get_rec(
         return 0;
  }
  
+/*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+       struct xfs_inobt_rec_incore     *rec)
+{
+       xfs_inofree_t                   realfree;
+
+       /* if there are no holes, return the first available offset */
+       if (!xfs_inobt_issparse(rec->ir_holemask))
+               return xfs_lowbit64(rec->ir_free);
+
+       realfree = xfs_inobt_irec_to_allocmask(rec);
+       realfree &= rec->ir_free;
+
+       return xfs_lowbit64(realfree);
+}
+
  /*
   * Allocate an inode using the inobt-only algorithm.
   */
@@ -961,7 +1327,7 @@ newino:
         }
  
  alloc_inode:
-       offset = xfs_lowbit64(rec.ir_free);
+       offset = xfs_inobt_first_free_inode(&rec);
         ASSERT(offset >= 0);
         ASSERT(offset < XFS_INODES_PER_CHUNK);
         ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1576,7 @@ xfs_dialloc_ag(
         if (error)
                 goto error_cur;
  
-       offset = xfs_lowbit64(rec.ir_free);
+       offset = xfs_inobt_first_free_inode(&rec);
         ASSERT(offset >= 0);
         ASSERT(offset < XFS_INODES_PER_CHUNK);
         ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1805,83 @@ out_error:
         return error;
  }
  
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+       struct xfs_mount                *mp,
+       xfs_agnumber_t                  agno,
+       struct xfs_inobt_rec_incore     *rec,
+       struct xfs_bmap_free            *flist)
+{
+       xfs_agblock_t   sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+       int             startidx, endidx;
+       int             nextbit;
+       xfs_agblock_t   agbno;
+       int             contigblk;
+       DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+       if (!xfs_inobt_issparse(rec->ir_holemask)) {
+               /* not sparse, calculate extent info directly */
+               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+                                 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+                                 mp->m_ialloc_blks, flist, mp);
+               return;
+       }
+
+       /* holemask is only 16-bits (fits in an unsigned long) */
+       ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+       holemask[0] = rec->ir_holemask;
+
+       /*
+        * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+        * holemask and convert the start/end index of each range to an extent.
+        * We start with the start and end index both pointing at the first 0 in
+        * the mask.
+        */
+       startidx = endidx = find_first_zero_bit(holemask,
+                                               XFS_INOBT_HOLEMASK_BITS);
+       nextbit = startidx + 1;
+       while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+               nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+                                            nextbit);
+               /*
+                * If the next zero bit is contiguous, update the end index of
+                * the current range and continue.
+                */
+               if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+                   nextbit == endidx + 1) {
+                       endidx = nextbit;
+                       goto next;
+               }
+
+               /*
+                * nextbit is not contiguous with the current end index. Convert
+                * the current start/end to an extent and add it to the free
+                * list.
+                */
+               agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+                                 mp->m_sb.sb_inopblock;
+               contigblk = ((endidx - startidx + 1) *
+                            XFS_INODES_PER_HOLEMASK_BIT) /
+                           mp->m_sb.sb_inopblock;
+
+               ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+               ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+                                 flist, mp);
+
+               /* reset range to current bit and carry on... */
+               startidx = endidx = nextbit;
+
+next:
+               nextbit++;
+       }
+}
+
  STATIC int
  xfs_difree_inobt(
         struct xfs_mount                *mp,
@@ -1446,8 +1889,7 @@ xfs_difree_inobt(
         struct xfs_buf                  *agbp,
         xfs_agino_t                     agino,
         struct xfs_bmap_free            *flist,
-       int                             *deleted,
-       xfs_ino_t                       *first_ino,
+       struct xfs_icluster             *xic,
         struct xfs_inobt_rec_incore     *orec)
  {
         struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1943,23 @@ xfs_difree_inobt(
         rec.ir_freecount++;
  
         /*
-        * When an inode cluster is free, it becomes eligible for removal
+        * When an inode chunk is free, it becomes eligible for removal. Don't
+        * remove the chunk if the block size is large enough for multiple inode
+        * chunks (that might not be free).
          */
         if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
-           (rec.ir_freecount == mp->m_ialloc_inos)) {
-
-               *deleted = 1;
-               *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+           rec.ir_free == XFS_INOBT_ALL_FREE &&
+           mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+               xic->deleted = 1;
+               xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+               xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
  
                 /*
                  * Remove the inode cluster from the AGI B+Tree, adjust the
                  * AGI and Superblock inode counts, and mark the disk space
                  * to be freed when the transaction is committed.
                  */
-               ilen = mp->m_ialloc_inos;
+               ilen = rec.ir_freecount;
                 be32_add_cpu(&agi->agi_count, -ilen);
                 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1975,9 @@ xfs_difree_inobt(
                         goto error0;
                 }
  
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-                                 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
-                                 mp->m_ialloc_blks, flist, mp);
+               xfs_difree_inode_chunk(mp, agno, &rec, flist);
         } else {
-               *deleted = 0;
+               xic->deleted = 0;
  
                 error = xfs_inobt_update(cur, &rec);
                 if (error) {
@@ -1599,7 +2042,9 @@ xfs_difree_finobt(
                  */
                 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
  
-               error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+               error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+                                            ibtrec->ir_count,
+                                            ibtrec->ir_freecount,
                                              ibtrec->ir_free, &i);
                 if (error)
                         goto error;
@@ -1634,8 +2079,13 @@ xfs_difree_finobt(
          * free inode. Hence, if all of the inodes are free and we aren't
          * keeping inode chunks permanently on disk, remove the record.
          * Otherwise, update the record with the new information.
+        *
+        * Note that we currently can't free chunks when the block size is large
+        * enough for multiple chunks. Leave the finobt record to remain in sync
+        * with the inobt.
          */
-       if (rec.ir_freecount == mp->m_ialloc_inos &&
+       if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+           mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
             !(mp->m_flags & XFS_MOUNT_IKEEP)) {
                 error = xfs_btree_delete(cur, &i);
                 if (error)
@@ -1671,8 +2121,7 @@ xfs_difree(
         struct xfs_trans        *tp,            /* transaction pointer */
         xfs_ino_t               inode,          /* inode to be freed */
         struct xfs_bmap_free    *flist,         /* extents to free */
-       int                     *deleted,/* set if inode cluster was deleted */
-       xfs_ino_t               *first_ino)/* first inode in deleted cluster */
+       struct xfs_icluster     *xic)   /* cluster info if deleted */
  {
         /* REFERENCED */
         xfs_agblock_t           agbno;  /* block number containing inode */
@@ -1723,8 +2172,7 @@ xfs_difree(
         /*
          * Fix up the inode allocation btree.
          */
-       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
-                                &rec);
+       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
         if (error)
                 goto error0;
  
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h

index 100007d..6e450df 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
  /* Move inodes in clusters of this size */
  #define        XFS_INODE_BIG_CLUSTER_SIZE      8192
  
+struct xfs_icluster {
+       bool            deleted;        /* record is deleted */
+       xfs_ino_t       first_ino;      /* first inode number */
+       uint64_t        alloc;          /* inode phys. allocation bitmap for
+                                        * sparse chunks */
+};
+
  /* Calculate and return the number of filesystem blocks per inode cluster */
  static inline int
  xfs_icluster_size_fsb(
@@ -44,8 +51,7 @@ xfs_icluster_size_fsb(
  static inline struct xfs_dinode *
  xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
  {
-       return (struct xfs_dinode *)
-               (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+       return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
  }
  
  /*
@@ -90,8 +96,7 @@ xfs_difree(
         struct xfs_trans *tp,           /* transaction pointer */
         xfs_ino_t       inode,          /* inode to be freed */
         struct xfs_bmap_free *flist,    /* extents to free */
-       int             *deleted,       /* set if inode cluster was deleted */
-       xfs_ino_t       *first_ino);    /* first inode in deleted cluster */
+       struct xfs_icluster *ifree);    /* cluster info if deleted */
  
  /*
   * Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
   * Inode chunk initialisation routine
   */
  int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
-                         struct list_head *buffer_list,
+                         struct list_head *buffer_list, int icount,
                           xfs_agnumber_t agno, xfs_agblock_t agbno,
                           xfs_agblock_t length, unsigned int gen);
  
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c

index 964c465..674ad8f 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
         union xfs_btree_rec     *rec)
  {
         rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-       rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+               rec->inobt.ir_u.sp.ir_holemask =
+                                       cpu_to_be16(cur->bc_rec.i.ir_holemask);
+               rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+               rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+       } else {
+               /* ir_holemask/ir_count not supported on-disk */
+               rec->inobt.ir_u.f.ir_freecount =
+                                       cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       }
         rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
  }
  
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
                 return blocklen / sizeof(xfs_inobt_rec_t);
         return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
  }
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+       struct xfs_inobt_rec_incore     *rec)
+{
+       uint64_t                        bitmap = 0;
+       uint64_t                        inodespbit;
+       int                             nextbit;
+       uint                            allocbitmap;
+
+       /*
+        * The holemask has 16-bits for a 64 inode record. Therefore each
+        * holemask bit represents multiple inodes. Create a mask of bits to set
+        * in the allocmask for each holemask bit.
+        */
+       inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+       /*
+        * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+        * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+        * anything beyond the 16 holemask bits since this casts to a larger
+        * type.
+        */
+       allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+       /*
+        * allocbitmap is the inverted holemask so every set bit represents
+        * allocated inodes. To expand from 16-bit holemask granularity to
+        * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+        * bitmap for every holemask bit.
+        */
+       nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+       while (nextbit != -1) {
+               ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+               bitmap |= (inodespbit <<
+                          (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+               nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+       }
+
+       return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+       struct xfs_mount                *mp,
+       struct xfs_inobt_rec_incore     *rec)
+{
+       int                             inocount = 0;
+       int                             nextbit = 0;
+       uint64_t                        allocbmap;
+       int                             wordsz;
+
+       wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+       allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+       nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+       while (nextbit != -1) {
+               inocount++;
+               nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+                                      nextbit + 1);
+       }
+
+       if (inocount != rec->ir_count)
+               return -EFSCORRUPTED;
+
+       return 0;
+}
+#endif /* DEBUG */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h

index d7ebea7..bd88453 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
                 xfs_btnum_t);
  extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
  
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+                             struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec)     0
+#endif /* DEBUG */
+
  #endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c

index 002b6b3..6526e76 100644 (file)
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -46,8 +46,7 @@ xfs_inobp_check(
         j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
  
         for (i = 0; i < j; i++) {
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       i * mp->m_sb.sb_inodesize);
+               dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
                 if (!dip->di_next_unlinked)  {
                         xfs_alert(mp,
         "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
@@ -86,8 +85,7 @@ xfs_inode_buf_verify(
                 int             di_ok;
                 xfs_dinode_t    *dip;
  
-               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
+               dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
                 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
                             XFS_DINODE_GOOD_VERSION(dip->di_version);
                 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
@@ -186,7 +184,7 @@ xfs_imap_to_bp(
         }
  
         *bpp = bp;
-       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+       *dipp = xfs_buf_offset(bp, imap->im_boffset);
         return 0;
  }
  
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c

index dc4bfc5..df9851c 100644 (file)
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
                         return -EFSCORRUPTED;
         }
  
+       /*
+        * Full inode chunks must be aligned to inode chunk size when
+        * sparse inodes are enabled to support the sparse chunk
+        * allocation algorithm and prevent overlapping inode records.
+        */
+       if (xfs_sb_version_hassparseinodes(sbp)) {
+               uint32_t        align;
+
+               xfs_alert(mp,
+       "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
+               align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+                               >> sbp->sb_blocklog;
+               if (sbp->sb_inoalignmt != align) {
+                       xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+                                sbp->sb_inoalignmt, align);
+                       return -EINVAL;
+               }
+       }
+
         if (unlikely(
             sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
                 xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
                                 be32_to_cpu(from->sb_features_log_incompat);
         /* crc is only used on disk, not in memory; just init to 0 here. */
         to->sb_crc = 0;
-       to->sb_pad = 0;
+       to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
         to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
         to->sb_lsn = be64_to_cpu(from->sb_lsn);
         /* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
                                 cpu_to_be32(from->sb_features_incompat);
                 to->sb_features_log_incompat =
                                 cpu_to_be32(from->sb_features_log_incompat);
-               to->sb_pad = 0;
+               to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
                 to->sb_lsn = cpu_to_be64(from->sb_lsn);
         }
  }
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
         mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
                                         sbp->sb_inopblock);
         mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+       if (sbp->sb_spino_align)
+               mp->m_ialloc_min_blks = sbp->sb_spino_align;
+       else
+               mp->m_ialloc_min_blks = mp->m_ialloc_blks;
  }
  
  /*
@@ -792,12 +818,12 @@ xfs_sync_sb(
         tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
         xfs_log_sb(tp);
         if (wait)
                 xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
  }
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h

index 8dda4b3..5be5297 100644 (file)
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -181,12 +181,6 @@ int        xfs_log_calc_minimum_size(struct xfs_mount *);
  #define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
  #define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
                                            count in superblock */
-/*
- * Values for call flags parameter.
- */
-#define        XFS_TRANS_RELEASE_LOG_RES       0x4
-#define        XFS_TRANS_ABORT                 0x8
-
  /*
   * Field values for xfs_trans_mod_sb.
   */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h

index 2d5bdfc..7978150 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,9 +73,9 @@ struct xfs_trans_resv {
   * 2 trees * (2 blocks/level * max depth - 1) * block size
   */
  #define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
-       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
  #define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+       ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
  
  /*
   * Per-directory log reservation for any directory change.
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h

index bf9c457..41e0428 100644 (file)
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -67,7 +67,7 @@
  #define        XFS_DIOSTRAT_SPACE_RES(mp, v)   \
         (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
  #define        XFS_GROWFS_SPACE_RES(mp)        \
-       (2 * XFS_AG_MAXLEVELS(mp))
+       (2 * (mp)->m_ag_maxlevels)
  #define        XFS_GROWFSRT_SPACE_RES(mp,b)    \
         ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
  #define        XFS_LINK_SPACE_RES(mp,nl)       \
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c

index e5099f2..3859f5e 100644 (file)
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc(
  
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -145,7 +145,7 @@ xfs_setfilesize(
         isize = xfs_new_eof(ip, offset + size);
         if (!isize) {
                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return 0;
         }
  
@@ -155,7 +155,7 @@ xfs_setfilesize(
         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
  }
  
  STATIC int
@@ -1348,7 +1348,7 @@ __xfs_get_blocks(
         sector_t                iblock,
         struct buffer_head      *bh_result,
         int                     create,
-       int                     direct)
+       bool                    direct)
  {
         struct xfs_inode        *ip = XFS_I(inode);
         struct xfs_mount        *mp = ip->i_mount;
@@ -1413,6 +1413,7 @@ __xfs_get_blocks(
                         if (error)
                                 return error;
                         new = 1;
+
                 } else {
                         /*
                          * Delalloc reservations do not require a transaction,
@@ -1507,49 +1508,29 @@ xfs_get_blocks(
         struct buffer_head      *bh_result,
         int                     create)
  {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, false);
  }
  
-STATIC int
+int
  xfs_get_blocks_direct(
         struct inode            *inode,
         sector_t                iblock,
         struct buffer_head      *bh_result,
         int                     create)
  {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true);
  }
  
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-       struct kiocb            *iocb,
+static void
+__xfs_end_io_direct_write(
+       struct inode            *inode,
+       struct xfs_ioend        *ioend,
         loff_t                  offset,
-       ssize_t                 size,
-       void                    *private)
+       ssize_t                 size)
  {
-       struct inode            *inode = file_inode(iocb->ki_filp);
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ioend        *ioend = private;
-
-       trace_xfs_gbmap_direct_endio(ip, offset, size,
-                                    ioend ? ioend->io_type : 0, NULL);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
  
-       if (!ioend) {
-               ASSERT(offset + size <= i_size_read(inode));
-               return;
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
                 goto out_end_io;
  
         /*
@@ -1586,10 +1567,10 @@ xfs_end_io_direct_write(
          * here can result in EOF moving backwards and Bad Things Happen when
          * that occurs.
          */
-       spin_lock(&ip->i_flags_lock);
+       spin_lock(&XFS_I(inode)->i_flags_lock);
         if (offset + size > i_size_read(inode))
                 i_size_write(inode, offset + size);
-       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&XFS_I(inode)->i_flags_lock);
  
         /*
          * If we are doing an append IO that needs to update the EOF on disk,
@@ -1606,6 +1587,98 @@ out_end_io:
         return;
  }
  
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+       struct kiocb            *iocb,
+       loff_t                  offset,
+       ssize_t                 size,
+       void                    *private)
+{
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_ioend        *ioend = private;
+
+       trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+                                    ioend ? ioend->io_type : 0, NULL);
+
+       if (!ioend) {
+               ASSERT(offset + size <= i_size_read(inode));
+               return;
+       }
+
+       __xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+       struct buffer_head      *bh,
+       int                     uptodate)
+{
+       struct xfs_ioend        *ioend = bh->b_private;
+       struct inode            *inode = ioend->io_inode;
+       ssize_t                 size = ioend->io_size;
+
+       ASSERT(IS_DAX(ioend->io_inode));
+
+       /* if there was an error zeroing, then don't convert it */
+       if (!uptodate)
+               ioend->io_error = -EIO;
+
+       /*
+        * Trim update to EOF, so we don't extend EOF during unwritten extent
+        * conversion of partial EOF blocks.
+        */
+       spin_lock(&XFS_I(inode)->i_flags_lock);
+       if (ioend->io_offset + size > i_size_read(inode))
+               size = i_size_read(inode) - ioend->io_offset;
+       spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+       __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+       struct inode            *inode,
+       struct kiocb            *iocb,
+       struct iov_iter         *iter,
+       loff_t                  offset,
+       void                    (*endio)(struct kiocb   *iocb,
+                                        loff_t         offset,
+                                        ssize_t        size,
+                                        void           *private),
+       int                     flags)
+{
+       struct block_device     *bdev;
+
+       if (IS_DAX(inode))
+               return dax_do_io(iocb, inode, iter, offset,
+                                xfs_get_blocks_direct, endio, 0);
+
+       bdev = xfs_find_bdev_for_inode(inode);
+       return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+                                    xfs_get_blocks_direct, endio, NULL, flags);
+}
+
  STATIC ssize_t
  xfs_vm_direct_IO(
         struct kiocb            *iocb,
@@ -1613,16 +1686,11 @@ xfs_vm_direct_IO(
         loff_t                  offset)
  {
         struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
  
-       if (iov_iter_rw(iter) == WRITE) {
-               return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                           xfs_get_blocks_direct,
-                                           xfs_end_io_direct_write, NULL,
-                                           DIO_ASYNC_EXTEND);
-       }
-       return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                   xfs_get_blocks_direct, NULL, NULL, 0);
+       if (iov_iter_rw(iter) == WRITE)
+               return xfs_vm_do_dio(inode, iocb, iter, offset,
+                                    xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+       return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
  }
  
  /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h

index ac644e0..86afd1a 100644 (file)
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
  } xfs_ioend_t;
  
  extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int    xfs_get_blocks(struct inode *inode, sector_t offset,
+                      struct buffer_head *map_bh, int create);
+int    xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+                             struct buffer_head *map_bh, int create);
+void   xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
  
  extern void xfs_count_page_state(struct page *, int *, int *);
  
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c

index 3fbf167..2bb959a 100644 (file)
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -394,7 +394,6 @@ xfs_attr_inactive(
  {
         struct xfs_trans        *trans;
         struct xfs_mount        *mp;
-       int                     cancel_flags = 0;
         int                     lock_mode = XFS_ILOCK_SHARED;
         int                     error = 0;
  
@@ -423,7 +422,6 @@ xfs_attr_inactive(
                 goto out_cancel;
  
         lock_mode = XFS_ILOCK_EXCL;
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT;
         xfs_ilock(dp, lock_mode);
  
         if (!XFS_IFORK_Q(dp))
@@ -435,8 +433,14 @@ xfs_attr_inactive(
          */
         xfs_trans_ijoin(trans, dp, 0);
  
-       /* invalidate and truncate the attribute fork extents */
-       if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+       /*
+        * Invalidate and truncate the attribute fork extents. Make sure the
+        * fork actually has attributes as otherwise the invalidation has no
+        * blocks to read and returns an error. In this case, just do the fork
+        * removal below.
+        */
+       if (xfs_inode_hasattr(dp) &&
+           dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
                 error = xfs_attr3_root_inactive(&trans, dp);
                 if (error)
                         goto out_cancel;
@@ -449,12 +453,12 @@ xfs_attr_inactive(
         /* Reset the attribute fork - this also destroys the in-core fork */
         xfs_attr_fork_remove(dp, trans);
  
-       error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(trans);
         xfs_iunlock(dp, lock_mode);
         return error;
  
  out_cancel:
-       xfs_trans_cancel(trans, cancel_flags);
+       xfs_trans_cancel(trans);
  out_destroy_fork:
         /* kill the in-core attr fork before we drop the inode lock */
         if (dp->i_afp)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index a52bbd3..0f34886 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,28 +75,20 @@ xfs_bmap_finish(
         xfs_efi_log_item_t      *efi;           /* extent free intention */
         int                     error;          /* error return value */
         xfs_bmap_free_item_t    *free;          /* free extent item */
-       struct xfs_trans_res    tres;           /* new log reservation */
         xfs_mount_t             *mp;            /* filesystem mount structure */
         xfs_bmap_free_item_t    *next;          /* next item on free list */
-       xfs_trans_t             *ntp;           /* new transaction pointer */
  
         ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
         if (flist->xbf_count == 0) {
                 *committed = 0;
                 return 0;
         }
-       ntp = *tp;
-       efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+       efi = xfs_trans_get_efi(*tp, flist->xbf_count);
         for (free = flist->xbf_first; free; free = free->xbfi_next)
-               xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+               xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
                         free->xbfi_blockcount);
  
-       tres.tr_logres = ntp->t_log_res;
-       tres.tr_logcount = ntp->t_log_count;
-       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-       ntp = xfs_trans_dup(*tp);
-       error = xfs_trans_commit(*tp, 0);
-       *tp = ntp;
+       error = xfs_trans_roll(tp, NULL);
         *committed = 1;
         /*
          * We have a new transaction, so we should return committed=1,
@@ -105,19 +97,10 @@ xfs_bmap_finish(
         if (error)
                 return error;
  
-       /*
-        * transaction commit worked ok so we can drop the extra ticket
-        * reference that we gained in xfs_trans_dup()
-        */
-       xfs_log_ticket_put(ntp->t_ticket);
-
-       error = xfs_trans_reserve(ntp, &tres, 0, 0);
-       if (error)
-               return error;
-       efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+       efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
         for (free = flist->xbf_first; free != NULL; free = next) {
                 next = free->xbfi_next;
-               if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+               if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
                                 free->xbfi_blockcount))) {
                         /*
                          * The bmap free list will be cleaned up at a
@@ -127,7 +110,7 @@ xfs_bmap_finish(
                          * happens, since this transaction may not be
                          * dirty yet.
                          */
-                       mp = ntp->t_mountp;
+                       mp = (*tp)->t_mountp;
                         if (!XFS_FORCED_SHUTDOWN(mp))
                                 xfs_force_shutdown(mp,
                                                    (error == -EFSCORRUPTED) ?
@@ -135,7 +118,7 @@ xfs_bmap_finish(
                                                    SHUTDOWN_META_IO_ERROR);
                         return error;
                 }
-               xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+               xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
                         free->xbfi_blockcount);
                 xfs_bmap_del_free(flist, NULL, free);
         }
@@ -878,7 +861,7 @@ xfs_free_eofblocks(
  
                 if (need_iolock) {
                         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-                               xfs_trans_cancel(tp, 0);
+                               xfs_trans_cancel(tp);
                                 return -EAGAIN;
                         }
                 }
@@ -886,7 +869,7 @@ xfs_free_eofblocks(
                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
                 if (error) {
                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                         if (need_iolock)
                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                         return error;
@@ -908,12 +891,9 @@ xfs_free_eofblocks(
                          * If we get an error at this point we simply don't
                          * bother truncating the file.
                          */
-                       xfs_trans_cancel(tp,
-                                        (XFS_TRANS_RELEASE_LOG_RES |
-                                         XFS_TRANS_ABORT));
+                       xfs_trans_cancel(tp);
                 } else {
-                       error = xfs_trans_commit(tp,
-                                               XFS_TRANS_RELEASE_LOG_RES);
+                       error = xfs_trans_commit(tp);
                         if (!error)
                                 xfs_inode_clear_eofblocks_tag(ip);
                 }
@@ -1026,7 +1006,7 @@ xfs_alloc_file_space(
                          * Free the transaction structure.
                          */
                         ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                         break;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1053,7 +1033,7 @@ xfs_alloc_file_space(
                         goto error0;
                 }
  
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
                 if (error) {
                         break;
@@ -1077,7 +1057,7 @@ error0:   /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
         xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
  
  error1:        /* Just cancel transaction */
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
  }
@@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes(
                         break;
                 ASSERT(imap.br_blockcount >= 1);
                 ASSERT(imap.br_startoff == offset_fsb);
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+               if (imap.br_startblock == HOLESTARTBLOCK ||
+                   imap.br_state == XFS_EXT_UNWRITTEN) {
+                       /* skip the entire extent */
+                       lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+                                                     imap.br_blockcount) - 1;
+                       continue;
+               }
+
                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
                 if (lastoffset > endoff)
                         lastoffset = endoff;
-               if (imap.br_startblock == HOLESTARTBLOCK)
-                       continue;
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+               /* DAX can just zero the backing device directly */
+               if (IS_DAX(VFS_I(ip))) {
+                       error = dax_zero_page_range(VFS_I(ip), offset,
+                                                   lastoffset - offset + 1,
+                                                   xfs_get_blocks_direct);
+                       if (error)
+                               return error;
                         continue;
+               }
  
                 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                 mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1284,7 @@ xfs_free_file_space(
                          * Free the transaction structure.
                          */
                         ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                         break;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1315,7 @@ xfs_free_file_space(
                         goto error0;
                 }
  
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
         }
  
@@ -1330,7 +1325,7 @@ xfs_free_file_space(
   error0:
         xfs_bmap_cancel(&free_list);
   error1:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         goto out;
  }
@@ -1462,7 +1457,7 @@ xfs_shift_file_space(
                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
                 if (error) {
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                         break;
                 }
  
@@ -1492,13 +1487,13 @@ xfs_shift_file_space(
                 if (error)
                         goto out;
  
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
         }
  
         return error;
  
  out:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         return error;
  }
  
@@ -1718,7 +1713,7 @@ xfs_swap_extents(
         tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 goto out_unlock;
         }
  
@@ -1901,7 +1896,7 @@ xfs_swap_extents(
         if (mp->m_flags & XFS_MOUNT_WSYNC)
                 xfs_trans_set_sync(tp);
  
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
  
         trace_xfs_swap_extent_after(ip, 0);
         trace_xfs_swap_extent_after(tip, 1);
@@ -1915,6 +1910,6 @@ out_unlock:
         goto out;
  
  out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
         goto out;
  }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c

index 1790b00..a4b7d92 100644 (file)
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1419,9 +1419,9 @@ xfs_buf_submit_wait(
         return error;
  }
  
-xfs_caddr_t
+void *
  xfs_buf_offset(
-       xfs_buf_t               *bp,
+       struct xfs_buf          *bp,
         size_t                  offset)
  {
         struct page             *page;
@@ -1431,7 +1431,7 @@ xfs_buf_offset(
  
         offset += bp->b_offset;
         page = bp->b_pages[offset >> PAGE_SHIFT];
-       return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+       return page_address(page) + (offset & (PAGE_SIZE-1));
  }
  
  /*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h

index 75ff5d5..331c1cc 100644 (file)
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
             xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
  
  /* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
  
  /* Delayed Write Buffer Routines */
  extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c

index 02c01bb..4143dc7 100644 (file)
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -568,8 +568,6 @@ xfs_qm_dqread(
         struct xfs_buf          *bp;
         struct xfs_trans        *tp = NULL;
         int                     error;
-       int                     cancelflags = 0;
-
  
         dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
  
@@ -617,7 +615,6 @@ xfs_qm_dqread(
                                           XFS_QM_DQALLOC_SPACE_RES(mp), 0);
                 if (error)
                         goto error1;
-               cancelflags = XFS_TRANS_RELEASE_LOG_RES;
         }
  
         /*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
                  * allocate (ENOENT).
                  */
                 trace_xfs_dqread_fail(dqp);
-               cancelflags |= XFS_TRANS_ABORT;
                 goto error1;
         }
  
@@ -670,7 +666,7 @@ xfs_qm_dqread(
         xfs_trans_brelse(tp, bp);
  
         if (tp) {
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                 if (error)
                         goto error0;
         }
@@ -680,7 +676,7 @@ xfs_qm_dqread(
  
  error1:
         if (tp)
-               xfs_trans_cancel(tp, cancelflags);
+               xfs_trans_cancel(tp);
  error0:
         xfs_qm_dqdestroy(dqp);
         *O_dqpp = NULL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c

index 338e50b..74d0e59 100644 (file)
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -127,7 +127,7 @@ xfs_error_report(
         struct xfs_mount        *mp,
         const char              *filename,
         int                     linenum,
-       inst_t                  *ra)
+       void                    *ra)
  {
         if (level <= xfs_error_level) {
                 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
@@ -146,7 +146,7 @@ xfs_corruption_error(
         void                    *p,
         const char              *filename,
         int                     linenum,
-       inst_t                  *ra)
+       void                    *ra)
  {
         if (level <= xfs_error_level)
                 xfs_hex_dump(p, 64);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h

index c0394ed..4ed3042 100644 (file)
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -21,10 +21,10 @@
  struct xfs_mount;
  
  extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
-                       const char *filename, int linenum, inst_t *ra);
+                       const char *filename, int linenum, void *ra);
  extern void xfs_corruption_error(const char *tag, int level,
                         struct xfs_mount *mp, void *p, const char *filename,
-                       int linenum, inst_t *ra);
+                       int linenum, void *ra);
  extern void xfs_verifier_error(struct xfs_buf *bp);
  
  #define        XFS_ERROR_REPORT(e, lvl, mp)    \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c

index cb7fe64..adc8f8f 100644 (file)
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -239,7 +239,7 @@ xfs_efi_init(
  
         xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
         efip->efi_format.efi_nextents = nextents;
-       efip->efi_format.efi_id = (__psint_t)(void*)efip;
+       efip->efi_format.efi_id = (uintptr_t)(void *)efip;
         atomic_set(&efip->efi_next_extent, 0);
         atomic_set(&efip->efi_refcount, 2);
  
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 7c62fca..874507d 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,14 +80,15 @@ xfs_rw_ilock_demote(
  }
  
  /*
- *     xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
   *
- *     xfs_iozero clears the specified range of buffer supplied,
- *     and marks all the affected blocks as valid and modified.  If
- *     an affected block is not allocated, it will be allocated.  If
- *     an affected block is not completely overwritten, and is not
- *     valid before the operation, it will be read from disk before
- *     being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
   */
  int
  xfs_iozero(
@@ -97,7 +98,8 @@ xfs_iozero(
  {
         struct page             *page;
         struct address_space    *mapping;
-       int                     status;
+       int                     status = 0;
+
  
         mapping = VFS_I(ip)->i_mapping;
         do {
@@ -109,20 +111,27 @@ xfs_iozero(
                 if (bytes > count)
                         bytes = count;
  
-               status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                       AOP_FLAG_UNINTERRUPTIBLE,
-                                       &page, &fsdata);
-               if (status)
-                       break;
+               if (IS_DAX(VFS_I(ip))) {
+                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+                                                    xfs_get_blocks_direct);
+                       if (status)
+                               break;
+               } else {
+                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                               AOP_FLAG_UNINTERRUPTIBLE,
+                                               &page, &fsdata);
+                       if (status)
+                               break;
  
-               zero_user(page, offset, bytes);
+                       zero_user(page, offset, bytes);
  
-               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-                                       page, fsdata);
-               WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = pagecache_write_end(NULL, mapping, pos, bytes,
+                                               bytes, page, fsdata);
+                       WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = 0;
+               }
                 pos += bytes;
                 count -= bytes;
-               status = 0;
         } while (count);
  
         return status;
@@ -139,7 +148,7 @@ xfs_update_prealloc_flags(
         tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
         error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -161,7 +170,7 @@ xfs_update_prealloc_flags(
         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
         if (flags & XFS_PREALLOC_SYNC)
                 xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
  }
  
  /*
@@ -285,7 +294,7 @@ xfs_file_read_iter(
         if (file->f_mode & FMODE_NOCMTIME)
                 ioflags |= XFS_IO_INVIS;
  
-       if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
                 xfs_buftarg_t   *target =
                         XFS_IS_REALTIME_INODE(ip) ?
                                 mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -379,7 +388,11 @@ xfs_file_splice_read(
  
         trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
  
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       /* for dax, we need to avoid the page cache */
+       if (IS_DAX(VFS_I(ip)))
+               ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+       else
+               ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
         if (ret > 0)
                 XFS_STATS_ADD(xs_read_bytes, ret);
  
@@ -673,7 +686,7 @@ xfs_file_dio_aio_write(
                                         mp->m_rtdev_targp : mp->m_ddev_targp;
  
         /* DIO must be aligned to device logical sector size */
-       if ((pos | count) & target->bt_logical_sectormask)
+       if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
                 return -EINVAL;
  
         /* "unaligned" here means not aligned to a filesystem block */
@@ -759,8 +772,11 @@ xfs_file_dio_aio_write(
  out:
         xfs_rw_iunlock(ip, iolock);
  
-       /* No fallback to buffered IO on errors for XFS. */
-       ASSERT(ret < 0 || ret == count);
+       /*
+        * No fallback to buffered IO on errors for XFS. DAX can result in
+        * partial writes, but direct IO will either complete fully or fail.
+        */
+       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
         return ret;
  }
  
@@ -843,7 +859,7 @@ xfs_file_write_iter(
         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 return -EIO;
  
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
                 ret = xfs_file_dio_aio_write(iocb, from);
         else
                 ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1064,17 +1080,6 @@ xfs_file_readdir(
         return xfs_readdir(ip, ctx, bufsize);
  }
  
-STATIC int
-xfs_file_mmap(
-       struct file     *filp,
-       struct vm_area_struct *vma)
-{
-       vma->vm_ops = &xfs_file_vm_ops;
-
-       file_accessed(filp);
-       return 0;
-}
-
  /*
   * This type is designed to indicate the type of offset we would like
   * to search from page cache for xfs_seek_hole_data().
@@ -1455,48 +1460,83 @@ xfs_file_llseek(
   * ordering of:
   *
   * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
   */
  STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
         struct vm_area_struct   *vma,
         struct vm_fault         *vmf)
  {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct inode            *inode = file_inode(vma->vm_file);
+       int                     ret;
  
-       trace_xfs_filemap_fault(ip);
+       trace_xfs_filemap_page_mkwrite(XFS_I(inode));
  
-       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = filemap_fault(vma, vmf);
-       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
-       return error;
+       if (IS_DAX(inode)) {
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+                                   xfs_end_io_dax_write);
+       } else {
+               ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = block_page_mkwrite_return(ret);
+       }
+
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+
+       return ret;
  }
  
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
  STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
         struct vm_area_struct   *vma,
         struct vm_fault         *vmf)
  {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct xfs_inode        *ip = XFS_I(file_inode(vma->vm_file));
+       int                     ret;
+
+       trace_xfs_filemap_fault(ip);
  
-       trace_xfs_filemap_page_mkwrite(ip);
+       /* DAX can shortcut the normal fault path on write faults! */
+       if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+               return xfs_filemap_page_mkwrite(vma, vmf);
  
         xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       ret = filemap_fault(vma, vmf);
         xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
  
-       return error;
+       return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+       .fault          = xfs_filemap_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+       struct file     *filp,
+       struct vm_area_struct *vma)
+{
+       file_accessed(filp);
+       vma->vm_ops = &xfs_file_vm_ops;
+       if (IS_DAX(file_inode(filp)))
+               vma->vm_flags |= VM_MIXEDMAP;
+       return 0;
  }
  
  const struct file_operations xfs_file_operations = {
@@ -1527,9 +1567,3 @@ const struct file_operations xfs_dir_file_operations = {
  #endif
         .fsync          = xfs_dir_fsync,
  };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = xfs_filemap_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_filemap_page_mkwrite,
-};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c

index da82f1c..c4c130f 100644 (file)
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -196,7 +196,8 @@ xfs_filestream_pick_ag(
                         goto next_ag;
                 }
  
-               longest = xfs_alloc_longest_free_extent(mp, pag);
+               longest = xfs_alloc_longest_free_extent(mp, pag,
+                                       xfs_alloc_min_freelist(mp, pag));
                 if (((minlen && longest >= minlen) ||
                      (!minlen && pag->pagf_freeblks >= minfree)) &&
                     (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c

index cb7e8a2..9b3438a 100644 (file)
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
                         (xfs_sb_version_hasftype(&mp->m_sb) ?
                                 XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
                         (xfs_sb_version_hasfinobt(&mp->m_sb) ?
-                               XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+                               XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
+                       (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
+                               XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
                 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
                                 mp->m_sb.sb_logsectsize : BBSIZE;
                 geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
                                   XFS_GROWFS_SPACE_RES(mp), 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
         if (dpct)
                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
         xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
         if (error)
                 return error;
  
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
         return saved_error ? saved_error : error;
  
   error0:
-       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         return error;
  }
  
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 539a85f..3da9f4d 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -905,7 +905,6 @@ xfs_dir_ialloc(
  
  {
         xfs_trans_t     *tp;
-       xfs_trans_t     *ntp;
         xfs_inode_t     *ip;
         xfs_buf_t       *ialloc_context = NULL;
         int             code;
@@ -954,8 +953,6 @@ xfs_dir_ialloc(
          * to succeed the second time.
          */
         if (ialloc_context) {
-               struct xfs_trans_res tres;
-
                 /*
                  * Normally, xfs_trans_commit releases all the locks.
                  * We call bhold to hang on to the ialloc_context across
@@ -964,12 +961,6 @@ xfs_dir_ialloc(
                  * allocation group.
                  */
                 xfs_trans_bhold(tp, ialloc_context);
-               /*
-                * Save the log reservation so we can use
-                * them in the next transaction.
-                */
-               tres.tr_logres = xfs_trans_get_log_res(tp);
-               tres.tr_logcount = xfs_trans_get_log_count(tp);
  
                 /*
                  * We want the quota changes to be associated with the next
@@ -985,35 +976,9 @@ xfs_dir_ialloc(
                         tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                 }
  
-               ntp = xfs_trans_dup(tp);
-               code = xfs_trans_commit(tp, 0);
-               tp = ntp;
-               if (committed != NULL) {
+               code = xfs_trans_roll(&tp, 0);
+               if (committed != NULL)
                         *committed = 1;
-               }
-               /*
-                * If we get an error during the commit processing,
-                * release the buffer that is still held and return
-                * to the caller.
-                */
-               if (code) {
-                       xfs_buf_relse(ialloc_context);
-                       if (dqinfo) {
-                               tp->t_dqinfo = dqinfo;
-                               xfs_trans_free_dqinfo(tp);
-                       }
-                       *tpp = ntp;
-                       *ipp = NULL;
-                       return code;
-               }
-
-               /*
-                * transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
-                */
-               xfs_log_ticket_put(tp->t_ticket);
-               tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-               code = xfs_trans_reserve(tp, &tres, 0, 0);
  
                 /*
                  * Re-attach the quota info that we detached from prev trx.
@@ -1025,7 +990,7 @@ xfs_dir_ialloc(
  
                 if (code) {
                         xfs_buf_relse(ialloc_context);
-                       *tpp = ntp;
+                       *tpp = tp;
                         *ipp = NULL;
                         return code;
                 }
@@ -1127,7 +1092,6 @@ xfs_create(
         xfs_bmap_free_t         free_list;
         xfs_fsblock_t           first_block;
         bool                    unlock_dp_on_error = false;
-       uint                    cancel_flags;
         int                     committed;
         prid_t                  prid;
         struct xfs_dquot        *udqp = NULL;
@@ -1164,8 +1128,6 @@ xfs_create(
                 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
         }
  
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
         /*
          * Initially assume that the file does not exist and
          * reserve the resources for that case.  If that is not
@@ -1183,10 +1145,9 @@ xfs_create(
                 resblks = 0;
                 error = xfs_trans_reserve(tp, tres, 0, 0);
         }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                 goto out_trans_cancel;
-       }
+
  
         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
         unlock_dp_on_error = true;
@@ -1217,7 +1178,7 @@ xfs_create(
         if (error) {
                 if (error == -ENOSPC)
                         goto out_trans_cancel;
-               goto out_trans_abort;
+               goto out_trans_cancel;
         }
  
         /*
@@ -1235,7 +1196,7 @@ xfs_create(
                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
         if (error) {
                 ASSERT(error != -ENOSPC);
-               goto out_trans_abort;
+               goto out_trans_cancel;
         }
         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1269,7 +1230,7 @@ xfs_create(
         if (error)
                 goto out_bmap_cancel;
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto out_release_inode;
  
@@ -1282,10 +1243,8 @@ xfs_create(
  
   out_bmap_cancel:
         xfs_bmap_cancel(&free_list);
- out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
   out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
   out_release_inode:
         /*
          * Wait until after the current transaction is aborted to finish the
@@ -1317,7 +1276,6 @@ xfs_create_tmpfile(
         struct xfs_inode        *ip = NULL;
         struct xfs_trans        *tp = NULL;
         int                     error;
-       uint                    cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
         prid_t                  prid;
         struct xfs_dquot        *udqp = NULL;
         struct xfs_dquot        *gdqp = NULL;
@@ -1350,10 +1308,8 @@ xfs_create_tmpfile(
                 resblks = 0;
                 error = xfs_trans_reserve(tp, tres, 0, 0);
         }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                 goto out_trans_cancel;
-       }
  
         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                 pdqp, resblks, 1, 0);
@@ -1365,7 +1321,7 @@ xfs_create_tmpfile(
         if (error) {
                 if (error == -ENOSPC)
                         goto out_trans_cancel;
-               goto out_trans_abort;
+               goto out_trans_cancel;
         }
  
         if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -1381,9 +1337,9 @@ xfs_create_tmpfile(
         ip->i_d.di_nlink--;
         error = xfs_iunlink(tp, ip);
         if (error)
-               goto out_trans_abort;
+               goto out_trans_cancel;
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto out_release_inode;
  
@@ -1394,10 +1350,8 @@ xfs_create_tmpfile(
         *ipp = ip;
         return 0;
  
- out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
   out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
   out_release_inode:
         /*
          * Wait until after the current transaction is aborted to finish the
@@ -1427,7 +1381,6 @@ xfs_link(
         int                     error;
         xfs_bmap_free_t         free_list;
         xfs_fsblock_t           first_block;
-       int                     cancel_flags;
         int                     committed;
         int                     resblks;
  
@@ -1447,17 +1400,14 @@ xfs_link(
                 goto std_return;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
         if (error == -ENOSPC) {
                 resblks = 0;
                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
         }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                 goto error_return;
-       }
  
         xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
  
@@ -1486,19 +1436,19 @@ xfs_link(
         if (sip->i_d.di_nlink == 0) {
                 error = xfs_iunlink_remove(tp, sip);
                 if (error)
-                       goto abort_return;
+                       goto error_return;
         }
  
         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
                                         &first_block, &free_list, resblks);
         if (error)
-               goto abort_return;
+               goto error_return;
         xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
  
         error = xfs_bumplink(tp, sip);
         if (error)
-               goto abort_return;
+               goto error_return;
  
         /*
          * If this is a synchronous mount, make sure that the
@@ -1512,15 +1462,13 @@ xfs_link(
         error = xfs_bmap_finish (&tp, &free_list, &committed);
         if (error) {
                 xfs_bmap_cancel(&free_list);
-               goto abort_return;
+               goto error_return;
         }
  
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       return xfs_trans_commit(tp);
  
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
   error_return:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
   std_return:
         return error;
  }
@@ -1555,7 +1503,6 @@ xfs_itruncate_extents(
  {
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_trans        *tp = *tpp;
-       struct xfs_trans        *ntp;
         xfs_bmap_free_t         free_list;
         xfs_fsblock_t           first_block;
         xfs_fileoff_t           first_unmap_block;
@@ -1613,29 +1560,7 @@ xfs_itruncate_extents(
                 if (error)
                         goto out_bmap_cancel;
  
-               if (committed) {
-                       /*
-                        * Mark the inode dirty so it will be logged and
-                        * moved forward in the log as part of every commit.
-                        */
-                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               }
-
-               ntp = xfs_trans_dup(tp);
-               error = xfs_trans_commit(tp, 0);
-               tp = ntp;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               if (error)
-                       goto out;
-
-               /*
-                * Transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
-                */
-               xfs_log_ticket_put(tp->t_ticket);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               error = xfs_trans_roll(&tp, ip);
                 if (error)
                         goto out;
         }
@@ -1756,7 +1681,7 @@ xfs_inactive_truncate(
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
         if (error) {
                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -1777,7 +1702,7 @@ xfs_inactive_truncate(
  
         ASSERT(ip->i_d.di_nextents == 0);
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto error_unlock;
  
@@ -1785,7 +1710,7 @@ xfs_inactive_truncate(
         return 0;
  
  error_trans_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
  error_unlock:
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
@@ -1835,7 +1760,7 @@ xfs_inactive_ifree(
                 } else {
                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
                 }
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -1855,7 +1780,7 @@ xfs_inactive_ifree(
                                 __func__, error);
                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                 }
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+               xfs_trans_cancel(tp);
                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
                 return error;
         }
@@ -1874,7 +1799,7 @@ xfs_inactive_ifree(
         if (error)
                 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
                         __func__, error);
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
                         __func__, error);
@@ -2235,28 +2160,42 @@ xfs_iunlink_remove(
   */
  STATIC int
  xfs_ifree_cluster(
-       xfs_inode_t     *free_ip,
-       xfs_trans_t     *tp,
-       xfs_ino_t       inum)
+       xfs_inode_t             *free_ip,
+       xfs_trans_t             *tp,
+       struct xfs_icluster     *xic)
  {
         xfs_mount_t             *mp = free_ip->i_mount;
         int                     blks_per_cluster;
         int                     inodes_per_cluster;
         int                     nbufs;
         int                     i, j;
+       int                     ioffset;
         xfs_daddr_t             blkno;
         xfs_buf_t               *bp;
         xfs_inode_t             *ip;
         xfs_inode_log_item_t    *iip;
         xfs_log_item_t          *lip;
         struct xfs_perag        *pag;
+       xfs_ino_t               inum;
  
+       inum = xic->first_ino;
         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
         blks_per_cluster = xfs_icluster_size_fsb(mp);
         inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
         nbufs = mp->m_ialloc_blks / blks_per_cluster;
  
         for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+               /*
+                * The allocation bitmap tells us which inodes of the chunk were
+                * physically allocated. Skip the cluster if an inode falls into
+                * a sparse region.
+                */
+               ioffset = inum - xic->first_ino;
+               if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+                       ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
+                       continue;
+               }
+
                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                          XFS_INO_TO_AGBNO(mp, inum));
  
@@ -2414,8 +2353,7 @@ xfs_ifree(
         xfs_bmap_free_t *flist)
  {
         int                     error;
-       int                     delete;
-       xfs_ino_t               first_ino;
+       struct xfs_icluster     xic = { 0 };
  
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
         ASSERT(ip->i_d.di_nlink == 0);
@@ -2431,7 +2369,7 @@ xfs_ifree(
         if (error)
                 return error;
  
-       error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+       error = xfs_difree(tp, ip->i_ino, flist, &xic);
         if (error)
                 return error;
  
@@ -2448,8 +2386,8 @@ xfs_ifree(
         ip->i_d.di_gen++;
         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  
-       if (delete)
-               error = xfs_ifree_cluster(ip, tp, first_ino);
+       if (xic.deleted)
+               error = xfs_ifree_cluster(ip, tp, &xic);
  
         return error;
  }
@@ -2536,7 +2474,6 @@ xfs_remove(
         int                     error = 0;
         xfs_bmap_free_t         free_list;
         xfs_fsblock_t           first_block;
-       int                     cancel_flags;
         int                     committed;
         uint                    resblks;
  
@@ -2557,7 +2494,6 @@ xfs_remove(
                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
         else
                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
  
         /*
          * We try to get the real space reservation first,
@@ -2576,7 +2512,6 @@ xfs_remove(
         }
         if (error) {
                 ASSERT(error != -ENOSPC);
-               cancel_flags = 0;
                 goto out_trans_cancel;
         }
  
@@ -2588,7 +2523,6 @@ xfs_remove(
         /*
          * If we're removing a directory perform some additional validation.
          */
-       cancel_flags |= XFS_TRANS_ABORT;
         if (is_dir) {
                 ASSERT(ip->i_d.di_nlink >= 2);
                 if (ip->i_d.di_nlink != 2) {
@@ -2644,7 +2578,7 @@ xfs_remove(
         if (error)
                 goto out_bmap_cancel;
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto std_return;
  
@@ -2656,7 +2590,7 @@ xfs_remove(
   out_bmap_cancel:
         xfs_bmap_cancel(&free_list);
   out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
   std_return:
         return error;
  }
@@ -2730,11 +2664,11 @@ xfs_finish_rename(
         error = xfs_bmap_finish(&tp, free_list, &committed);
         if (error) {
                 xfs_bmap_cancel(free_list);
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       return xfs_trans_commit(tp);
  }
  
  /*
@@ -2855,7 +2789,7 @@ xfs_cross_rename(
  
  out_trans_abort:
         xfs_bmap_cancel(free_list);
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         return error;
  }
  
@@ -2915,7 +2849,6 @@ xfs_rename(
         int                     num_inodes = __XFS_SORT_INODES;
         bool                    new_parent = (src_dp != target_dp);
         bool                    src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
-       int                     cancel_flags = 0;
         int                     spaceres;
         int                     error;
  
@@ -2951,7 +2884,6 @@ xfs_rename(
         }
         if (error)
                 goto out_trans_cancel;
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
  
         /*
          * Attach the dquots to the inodes
@@ -3022,10 +2954,8 @@ xfs_rename(
                 error = xfs_dir_createname(tp, target_dp, target_name,
                                                 src_ip->i_ino, &first_block,
                                                 &free_list, spaceres);
-               if (error == -ENOSPC)
-                       goto out_bmap_cancel;
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
  
                 xfs_trans_ichgtime(tp, target_dp,
                                         XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3033,7 +2963,7 @@ xfs_rename(
                 if (new_parent && src_is_directory) {
                         error = xfs_bumplink(tp, target_dp);
                         if (error)
-                               goto out_trans_abort;
+                               goto out_bmap_cancel;
                 }
         } else { /* target_ip != NULL */
                 /*
@@ -3065,7 +2995,7 @@ xfs_rename(
                                         src_ip->i_ino,
                                         &first_block, &free_list, spaceres);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
  
                 xfs_trans_ichgtime(tp, target_dp,
                                         XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3076,7 +3006,7 @@ xfs_rename(
                  */
                 error = xfs_droplink(tp, target_ip);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
  
                 if (src_is_directory) {
                         /*
@@ -3084,7 +3014,7 @@ xfs_rename(
                          */
                         error = xfs_droplink(tp, target_ip);
                         if (error)
-                               goto out_trans_abort;
+                               goto out_bmap_cancel;
                 }
         } /* target_ip != NULL */
  
@@ -3101,7 +3031,7 @@ xfs_rename(
                                         &first_block, &free_list, spaceres);
                 ASSERT(error != -EEXIST);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
         }
  
         /*
@@ -3127,7 +3057,7 @@ xfs_rename(
                  */
                 error = xfs_droplink(tp, src_dp);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
         }
  
         /*
@@ -3142,7 +3072,7 @@ xfs_rename(
                 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
                                            &first_block, &free_list, spaceres);
         if (error)
-               goto out_trans_abort;
+               goto out_bmap_cancel;
  
         /*
          * For whiteouts, we need to bump the link count on the whiteout inode.
@@ -3156,10 +3086,10 @@ xfs_rename(
                 ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
                 error = xfs_bumplink(tp, wip);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
                 error = xfs_iunlink_remove(tp, wip);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
                 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
  
                 /*
@@ -3180,12 +3110,10 @@ xfs_rename(
                 IRELE(wip);
         return error;
  
-out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
  out_bmap_cancel:
         xfs_bmap_cancel(&free_list);
  out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
         if (wip)
                 IRELE(wip);
         return error;
@@ -3464,7 +3392,7 @@ xfs_iflush_int(
         ASSERT(ip->i_d.di_version > 1);
  
         /* set *dip = inode's place in the buffer */
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+       dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
  
         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c

index 87f67c6..ea7d85a 100644 (file)
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -336,7 +336,7 @@ xfs_set_dmattrs(
         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
         xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +346,7 @@ xfs_set_dmattrs(
         ip->i_d.di_dmstate  = state;
  
         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
  
         return error;
  }
@@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans(
         return tp;
  
  out_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
         return ERR_PTR(error);
  }
  
@@ -1253,7 +1253,7 @@ xfs_ioctl_setattr(
         else
                 ip->i_d.di_extsize = 0;
  
-       code = xfs_trans_commit(tp, 0);
+       code = xfs_trans_commit(tp);
  
         /*
          * Release any dquot(s) the inode had kept before chown.
@@ -1265,7 +1265,7 @@ xfs_ioctl_setattr(
         return code;
  
  error_trans_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
  error_free_dquots:
         xfs_qm_dqrele(udqp);
         xfs_qm_dqrele(pdqp);
@@ -1338,11 +1338,11 @@ xfs_ioc_setxflags(
  
         error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 goto out_drop_write;
         }
  
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
  out_drop_write:
         mnt_drop_write_file(filp);
         return error;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index 38e633b..1f86033 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -183,7 +183,7 @@ xfs_iomap_write_direct(
          * Check for running out of space, note: need lock to return
          */
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -213,7 +213,7 @@ xfs_iomap_write_direct(
         error = xfs_bmap_finish(&tp, &free_list, &committed);
         if (error)
                 goto out_bmap_cancel;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto out_unlock;
  
@@ -236,7 +236,7 @@ out_bmap_cancel:
         xfs_bmap_cancel(&free_list);
         xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
  out_trans_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         goto out_unlock;
  }
  
@@ -690,7 +690,7 @@ xfs_iomap_write_allocate(
                         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                                   nres, 0);
                         if (error) {
-                               xfs_trans_cancel(tp, 0);
+                               xfs_trans_cancel(tp);
                                 return error;
                         }
                         xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -760,7 +760,7 @@ xfs_iomap_write_allocate(
                         if (error)
                                 goto trans_cancel;
  
-                       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+                       error = xfs_trans_commit(tp);
                         if (error)
                                 goto error0;
  
@@ -791,7 +791,7 @@ xfs_iomap_write_allocate(
  
  trans_cancel:
         xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
  error0:
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
@@ -853,7 +853,7 @@ xfs_iomap_write_unwritten(
                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                           resblks, 0);
                 if (error) {
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                         return error;
                 }
  
@@ -890,7 +890,7 @@ xfs_iomap_write_unwritten(
                 if (error)
                         goto error_on_bmapi_transaction;
  
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
                 if (error)
                         return error;
@@ -914,7 +914,7 @@ xfs_iomap_write_unwritten(
  
  error_on_bmapi_transaction:
         xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+       xfs_trans_cancel(tp);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
  }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c

index 7f51f39..766b23f 100644 (file)
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -699,7 +699,7 @@ xfs_setattr_nonsize(
  
         if (mp->m_flags & XFS_MOUNT_WSYNC)
                 xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
  
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
  
@@ -730,7 +730,7 @@ xfs_setattr_nonsize(
         return 0;
  
  out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
  out_dqrele:
         xfs_qm_dqrele(udqp);
@@ -752,7 +752,6 @@ xfs_setattr_size(
         struct xfs_trans        *tp;
         int                     error;
         uint                    lock_flags = 0;
-       uint                    commit_flags = 0;
         bool                    did_zeroing = false;
  
         trace_xfs_setattr(ip);
@@ -848,7 +847,11 @@ xfs_setattr_size(
          * to hope that the caller sees ENOMEM and retries the truncate
          * operation.
          */
-       error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+       if (IS_DAX(inode))
+               error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+       else
+               error = block_truncate_page(inode->i_mapping, newsize,
+                                           xfs_get_blocks);
         if (error)
                 return error;
         truncate_setsize(inode, newsize);
@@ -858,7 +861,6 @@ xfs_setattr_size(
         if (error)
                 goto out_trans_cancel;
  
-       commit_flags = XFS_TRANS_RELEASE_LOG_RES;
         lock_flags |= XFS_ILOCK_EXCL;
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         xfs_trans_ijoin(tp, ip, 0);
@@ -898,7 +900,7 @@ xfs_setattr_size(
         if (newsize <= oldsize) {
                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
                 if (error)
-                       goto out_trans_abort;
+                       goto out_trans_cancel;
  
                 /*
                  * Truncated "down", so we're removing references to old data
@@ -925,16 +927,14 @@ xfs_setattr_size(
         if (mp->m_flags & XFS_MOUNT_WSYNC)
                 xfs_trans_set_sync(tp);
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
  out_unlock:
         if (lock_flags)
                 xfs_iunlock(ip, lock_flags);
         return error;
  
-out_trans_abort:
-       commit_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
-       xfs_trans_cancel(tp, commit_flags);
+       xfs_trans_cancel(tp);
         goto out_unlock;
  }
  
@@ -981,7 +981,7 @@ xfs_vn_update_time(
         tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -1003,7 +1003,7 @@ xfs_vn_update_time(
         }
         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
         xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
  }
  
  #define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1188,22 +1188,22 @@ xfs_diflags_to_iflags(
         struct inode            *inode,
         struct xfs_inode        *ip)
  {
-       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+       uint16_t                flags = ip->i_d.di_flags;
+
+       inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+                           S_NOATIME | S_DAX);
+
+       if (flags & XFS_DIFLAG_IMMUTABLE)
                 inode->i_flags |= S_IMMUTABLE;
-       else
-               inode->i_flags &= ~S_IMMUTABLE;
-       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+       if (flags & XFS_DIFLAG_APPEND)
                 inode->i_flags |= S_APPEND;
-       else
-               inode->i_flags &= ~S_APPEND;
-       if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+       if (flags & XFS_DIFLAG_SYNC)
                 inode->i_flags |= S_SYNC;
-       else
-               inode->i_flags &= ~S_SYNC;
-       if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+       if (flags & XFS_DIFLAG_NOATIME)
                 inode->i_flags |= S_NOATIME;
-       else
-               inode->i_flags &= ~S_NOATIME;
+       /* XXX: Also needs an on-disk per inode flag! */
+       if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+               inode->i_flags |= S_DAX;
  }
  
  /*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c

index 8042989..f41b0c3 100644 (file)
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
                 }
  
                 irec->ir_free |= xfs_inobt_maskn(0, idx);
-               *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+               *icount = irec->ir_count - irec->ir_freecount;
         }
  
         return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
                                 goto del_cursor;
                         if (icount) {
                                 irbp->ir_startino = r.ir_startino;
+                               irbp->ir_holemask = r.ir_holemask;
+                               irbp->ir_count = r.ir_count;
                                 irbp->ir_freecount = r.ir_freecount;
                                 irbp->ir_free = r.ir_free;
                                 irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
                          * If this chunk has any allocated inodes, save it.
                          * Also start read-ahead now for this chunk.
                          */
-                       if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+                       if (r.ir_freecount < r.ir_count) {
                                 xfs_bulkstat_ichunk_ra(mp, agno, &r);
                                 irbp->ir_startino = r.ir_startino;
+                               irbp->ir_holemask = r.ir_holemask;
+                               irbp->ir_count = r.ir_count;
                                 irbp->ir_freecount = r.ir_freecount;
                                 irbp->ir_free = r.ir_free;
                                 irbp++;
-                               icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+                               icount += r.ir_count - r.ir_freecount;
                         }
                         error = xfs_btree_increment(cur, 0, &stat);
                         if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
                 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
                 buffer[bufidx].xi_startino =
                         XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
-               buffer[bufidx].xi_alloccount =
-                       XFS_INODES_PER_CHUNK - r.ir_freecount;
+               buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
                 buffer[bufidx].xi_allocmask = ~r.ir_free;
                 if (++bufidx == bcount) {
                         long    written;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h

index 7c7842c..85f883d 100644 (file)
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,26 +32,12 @@ typedef unsigned int                __uint32_t;
  typedef signed long long int   __int64_t;
  typedef unsigned long long int __uint64_t;
  
-typedef __uint32_t             inst_t;         /* an instruction */
-
  typedef __s64                  xfs_off_t;      /* <file offset> type */
  typedef unsigned long long     xfs_ino_t;      /* <inode> type */
  typedef __s64                  xfs_daddr_t;    /* <disk address> type */
-typedef char *                 xfs_caddr_t;    /* <core address> type */
  typedef __u32                  xfs_dev_t;
  typedef __u32                  xfs_nlink_t;
  
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
  #include "xfs_types.h"
  
  #include "kmem.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index bcc7cfa..08d4fe4 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -109,7 +109,7 @@ xlog_ungrant_log_space(
  STATIC void
  xlog_verify_dest_ptr(
         struct xlog             *log,
-       char                    *ptr);
+       void                    *ptr);
  STATIC void
  xlog_verify_grant_tail(
         struct xlog *log);
@@ -513,7 +513,7 @@ xfs_log_done(
         struct xfs_mount        *mp,
         struct xlog_ticket      *ticket,
         struct xlog_in_core     **iclog,
-       uint                    flags)
+       bool                    regrant)
  {
         struct xlog             *log = mp->m_log;
         xfs_lsn_t               lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
             (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
              (xlog_commit_record(log, ticket, iclog, &lsn)))) {
                 lsn = (xfs_lsn_t) -1;
-               if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
-                       flags |= XFS_LOG_REL_PERM_RESERV;
-               }
+               regrant = false;
         }
  
  
-       if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
-           (flags & XFS_LOG_REL_PERM_RESERV)) {
+       if (!regrant) {
                 trace_xfs_log_done_nonperm(log, ticket);
  
                 /*
@@ -541,7 +538,6 @@ xfs_log_done(
                  * request has been made to release a permanent reservation.
                  */
                 xlog_ungrant_log_space(log, ticket);
-               xfs_log_ticket_put(ticket);
         } else {
                 trace_xfs_log_done_perm(log, ticket);
  
@@ -553,6 +549,7 @@ xfs_log_done(
                 ticket->t_flags |= XLOG_TIC_INITED;
         }
  
+       xfs_log_ticket_put(ticket);
         return lsn;
  }
  
@@ -1447,7 +1444,7 @@ xlog_alloc_log(
                 iclog->ic_bp = bp;
                 iclog->ic_data = bp->b_addr;
  #ifdef DEBUG
-               log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+               log->l_iclog_bak[i] = &iclog->ic_header;
  #endif
                 head = &iclog->ic_header;
                 memset(head, 0, sizeof(xlog_rec_header_t));
@@ -1602,7 +1599,7 @@ xlog_pack_data(
         int                     i, j, k;
         int                     size = iclog->ic_offset + roundoff;
         __be32                  cycle_lsn;
-       xfs_caddr_t             dp;
+       char                    *dp;
  
         cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
  
@@ -3664,7 +3661,7 @@ xlog_ticket_alloc(
  void
  xlog_verify_dest_ptr(
         struct xlog     *log,
-       char            *ptr)
+       void            *ptr)
  {
         int i;
         int good_ptr = 0;
@@ -3767,9 +3764,8 @@ xlog_verify_iclog(
         xlog_op_header_t        *ophead;
         xlog_in_core_t          *icptr;
         xlog_in_core_2_t        *xhdr;
-       xfs_caddr_t             ptr;
-       xfs_caddr_t             base_ptr;
-       __psint_t               field_offset;
+       void                    *base_ptr, *ptr, *p;
+       ptrdiff_t               field_offset;
         __uint8_t               clientid;
         int                     len, i, j, k, op_len;
         int                     idx;
@@ -3788,9 +3784,9 @@ xlog_verify_iclog(
         if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
                 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
  
-       ptr = (xfs_caddr_t) &iclog->ic_header;
-       for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
-            ptr += BBSIZE) {
+       base_ptr = ptr = &iclog->ic_header;
+       p = &iclog->ic_header;
+       for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
                 if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
                         xfs_emerg(log->l_mp, "%s: unexpected magic num",
                                 __func__);
@@ -3798,20 +3794,19 @@ xlog_verify_iclog(
  
         /* check fields */
         len = be32_to_cpu(iclog->ic_header.h_num_logops);
-       ptr = iclog->ic_datap;
-       base_ptr = ptr;
-       ophead = (xlog_op_header_t *)ptr;
+       base_ptr = ptr = iclog->ic_datap;
+       ophead = ptr;
         xhdr = iclog->ic_data;
         for (i = 0; i < len; i++) {
-               ophead = (xlog_op_header_t *)ptr;
+               ophead = ptr;
  
                 /* clientid is only 1 byte */
-               field_offset = (__psint_t)
-                              ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+               p = &ophead->oh_clientid;
+               field_offset = p - base_ptr;
                 if (!syncing || (field_offset & 0x1ff)) {
                         clientid = ophead->oh_clientid;
                 } else {
-                       idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+                       idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
                         if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
                                 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                                 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,13 +3824,13 @@ xlog_verify_iclog(
                                 (unsigned long)field_offset);
  
                 /* check length */
-               field_offset = (__psint_t)
-                              ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+               p = &ophead->oh_len;
+               field_offset = p - base_ptr;
                 if (!syncing || (field_offset & 0x1ff)) {
                         op_len = be32_to_cpu(ophead->oh_len);
                 } else {
-                       idx = BTOBBT((__psint_t)&ophead->oh_len -
-                                   (__psint_t)iclog->ic_datap);
+                       idx = BTOBBT((uintptr_t)&ophead->oh_len -
+                                   (uintptr_t)iclog->ic_datap);
                         if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
                                 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                                 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h

index 84e0deb..fa27aae 100644 (file)
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,15 +110,6 @@ static inline xfs_lsn_t    _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
  
  #define        XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
  
-/*
- * Macros, structures, prototypes for interface to the log manager.
- */
-
-/*
- * Flags to xfs_log_done()
- */
-#define XFS_LOG_REL_PERM_RESERV        0x1
-
  /*
   * Flags to xfs_log_force()
   *
@@ -138,7 +129,7 @@ struct xfs_log_callback;
  xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
                        struct xlog_ticket *ticket,
                        struct xlog_in_core **iclog,
-                      uint             flags);
+                      bool regrant);
  int      _xfs_log_force(struct xfs_mount *mp,
                          uint           flags,
                          int            *log_forced);
@@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
  void     xfs_log_ticket_put(struct xlog_ticket *ticket);
  
  void   xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
-                               xfs_lsn_t *commit_lsn, int flags);
+                               xfs_lsn_t *commit_lsn, bool regrant);
  bool   xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
  
  void   xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c

index 45cc0ce..abc2ccb 100644 (file)
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -624,7 +624,7 @@ restart:
         spin_unlock(&cil->xc_push_lock);
  
         /* xfs_log_done always frees the ticket on error. */
-       commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+       commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
         if (commit_lsn == -1)
                 goto out_abort;
  
@@ -773,14 +773,10 @@ xfs_log_commit_cil(
         struct xfs_mount        *mp,
         struct xfs_trans        *tp,
         xfs_lsn_t               *commit_lsn,
-       int                     flags)
+       bool                    regrant)
  {
         struct xlog             *log = mp->m_log;
         struct xfs_cil          *cil = log->l_cilp;
-       int                     log_flags = 0;
-
-       if (flags & XFS_TRANS_RELEASE_LOG_RES)
-               log_flags = XFS_LOG_REL_PERM_RESERV;
  
         /* lock out background commit */
         down_read(&cil->xc_ctx_lock);
@@ -795,7 +791,7 @@ xfs_log_commit_cil(
         if (commit_lsn)
                 *commit_lsn = tp->t_commit_lsn;
  
-       xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+       xfs_log_done(mp, tp->t_ticket, NULL, regrant);
         xfs_trans_unreserve_and_mod_sb(tp);
  
         /*
@@ -809,7 +805,7 @@ xfs_log_commit_cil(
          * the log items. This affects (at least) processing of stale buffers,
          * inodes and EFIs.
          */
-       xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+       xfs_trans_free_items(tp, tp->t_commit_lsn, false);
  
         xlog_cil_push_background(log);
  
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h

index db7cbde..1c87c8a 100644 (file)
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -409,7 +409,7 @@ struct xlog {
  
         /* The following field are used for debugging; need to hold icloglock */
  #ifdef DEBUG
-       char                    *l_iclog_bak[XLOG_MAX_ICLOGS];
+       void                    *l_iclog_bak[XLOG_MAX_ICLOGS];
  #endif
  
  };
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c

index 4f5784f..01dd228 100644 (file)
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_put_bp(
   * Return the address of the start of the given block number's data
   * in a log buffer.  The buffer covers a log sector-aligned region.
   */
-STATIC xfs_caddr_t
+STATIC char *
  xlog_align(
         struct xlog     *log,
         xfs_daddr_t     blk_no,
@@ -203,7 +203,7 @@ xlog_bread(
         xfs_daddr_t     blk_no,
         int             nbblks,
         struct xfs_buf  *bp,
-       xfs_caddr_t     *offset)
+       char            **offset)
  {
         int             error;
  
@@ -225,9 +225,9 @@ xlog_bread_offset(
         xfs_daddr_t     blk_no,         /* block to read from */
         int             nbblks,         /* blocks to read */
         struct xfs_buf  *bp,
-       xfs_caddr_t     offset)
+       char            *offset)
  {
-       xfs_caddr_t     orig_offset = bp->b_addr;
+       char            *orig_offset = bp->b_addr;
         int             orig_len = BBTOB(bp->b_length);
         int             error, error2;
  
@@ -396,7 +396,7 @@ xlog_find_cycle_start(
         xfs_daddr_t     *last_blk,
         uint            cycle)
  {
-       xfs_caddr_t     offset;
+       char            *offset;
         xfs_daddr_t     mid_blk;
         xfs_daddr_t     end_blk;
         uint            mid_cycle;
@@ -443,7 +443,7 @@ xlog_find_verify_cycle(
         uint            cycle;
         xfs_buf_t       *bp;
         xfs_daddr_t     bufblks;
-       xfs_caddr_t     buf = NULL;
+       char            *buf = NULL;
         int             error = 0;
  
         /*
@@ -509,7 +509,7 @@ xlog_find_verify_log_record(
  {
         xfs_daddr_t             i;
         xfs_buf_t               *bp;
-       xfs_caddr_t             offset = NULL;
+       char                    *offset = NULL;
         xlog_rec_header_t       *head = NULL;
         int                     error = 0;
         int                     smallmem = 0;
@@ -616,7 +616,7 @@ xlog_find_head(
         xfs_daddr_t     *return_head_blk)
  {
         xfs_buf_t       *bp;
-       xfs_caddr_t     offset;
+       char            *offset;
         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
         int             num_scan_bblks;
         uint            first_half_cycle, last_half_cycle;
@@ -891,7 +891,7 @@ xlog_find_tail(
  {
         xlog_rec_header_t       *rhead;
         xlog_op_header_t        *op_head;
-       xfs_caddr_t             offset = NULL;
+       char                    *offset = NULL;
         xfs_buf_t               *bp;
         int                     error, i, found;
         xfs_daddr_t             umount_data_blk;
@@ -1099,7 +1099,7 @@ xlog_find_zeroed(
         xfs_daddr_t     *blk_no)
  {
         xfs_buf_t       *bp;
-       xfs_caddr_t     offset;
+       char            *offset;
         uint            first_cycle, last_cycle;
         xfs_daddr_t     new_blk, last_blk, start_blk;
         xfs_daddr_t     num_scan_bblks;
@@ -1199,7 +1199,7 @@ bp_err:
  STATIC void
  xlog_add_record(
         struct xlog             *log,
-       xfs_caddr_t             buf,
+       char                    *buf,
         int                     cycle,
         int                     block,
         int                     tail_cycle,
@@ -1227,7 +1227,7 @@ xlog_write_log_records(
         int             tail_cycle,
         int             tail_block)
  {
-       xfs_caddr_t     offset;
+       char            *offset;
         xfs_buf_t       *bp;
         int             balign, ealign;
         int             sectbb = log->l_sectBBsize;
@@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer(
                         return -EFSCORRUPTED;
                 }
  
-               buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
-                                             next_unlinked_offset);
+               buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
                 *buffer_nextp = *logged_nextp;
  
                 /*
@@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer(
                  * have to leave the inode in a consistent state for whoever
                  * reads it next....
                  */
-               xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+               xfs_dinode_calc_crc(mp,
                                 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
  
         }
@@ -2503,8 +2502,8 @@ xlog_recover_inode_pass2(
         xfs_buf_t               *bp;
         xfs_dinode_t            *dip;
         int                     len;
-       xfs_caddr_t             src;
-       xfs_caddr_t             dest;
+       char                    *src;
+       char                    *dest;
         int                     error;
         int                     attr_index;
         uint                    fields;
@@ -2546,7 +2545,7 @@ xlog_recover_inode_pass2(
                 goto out_release;
         }
         ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+       dip = xfs_buf_offset(bp, in_f->ilf_boffset);
  
         /*
          * Make sure the place we're flushing out to really looks
@@ -2885,7 +2884,7 @@ xlog_recover_dquot_pass2(
                 return error;
  
         ASSERT(bp);
-       ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+       ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
  
         /*
          * If the dquot has an LSN in it, recover the dquot only if it's less
@@ -3068,12 +3067,22 @@ xlog_recover_do_icreate_pass2(
                 return -EINVAL;
         }
  
-       /* existing allocation is fixed value */
-       ASSERT(count == mp->m_ialloc_inos);
-       ASSERT(length == mp->m_ialloc_blks);
-       if (count != mp->m_ialloc_inos ||
-            length != mp->m_ialloc_blks) {
-               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+       /*
+        * The inode chunk is either full or sparse and we only support
+        * m_ialloc_min_blks sized sparse allocations at this time.
+        */
+       if (length != mp->m_ialloc_blks &&
+           length != mp->m_ialloc_min_blks) {
+               xfs_warn(log->l_mp,
+                        "%s: unsupported chunk length", __FUNCTION__);
+               return -EINVAL;
+       }
+
+       /* verify inode count is consistent with extent length */
+       if ((count >> mp->m_sb.sb_inopblog) != length) {
+               xfs_warn(log->l_mp,
+                        "%s: inconsistent inode count and chunk length",
+                        __FUNCTION__);
                 return -EINVAL;
         }
  
@@ -3091,8 +3100,8 @@ xlog_recover_do_icreate_pass2(
                         XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
                 return 0;
  
-       xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
-                                       be32_to_cpu(icl->icl_gen));
+       xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
+                             be32_to_cpu(icl->icl_gen));
         return 0;
  }
  
@@ -3364,17 +3373,17 @@ STATIC int
  xlog_recover_add_to_cont_trans(
         struct xlog             *log,
         struct xlog_recover     *trans,
-       xfs_caddr_t             dp,
+       char                    *dp,
         int                     len)
  {
         xlog_recover_item_t     *item;
-       xfs_caddr_t             ptr, old_ptr;
+       char                    *ptr, *old_ptr;
         int                     old_len;
  
         if (list_empty(&trans->r_itemq)) {
                 /* finish copying rest of trans header */
                 xlog_recover_add_item(&trans->r_itemq);
-               ptr = (xfs_caddr_t) &trans->r_theader +
+               ptr = (char *)&trans->r_theader +
                                 sizeof(xfs_trans_header_t) - len;
                 memcpy(ptr, dp, len);
                 return 0;
@@ -3410,12 +3419,12 @@ STATIC int
  xlog_recover_add_to_trans(
         struct xlog             *log,
         struct xlog_recover     *trans,
-       xfs_caddr_t             dp,
+       char                    *dp,
         int                     len)
  {
         xfs_inode_log_format_t  *in_f;                  /* any will do */
         xlog_recover_item_t     *item;
-       xfs_caddr_t             ptr;
+       char                    *ptr;
  
         if (!len)
                 return 0;
@@ -3504,7 +3513,7 @@ STATIC int
  xlog_recovery_process_trans(
         struct xlog             *log,
         struct xlog_recover     *trans,
-       xfs_caddr_t             dp,
+       char                    *dp,
         unsigned int            len,
         unsigned int            flags,
         int                     pass)
@@ -3611,8 +3620,8 @@ xlog_recover_process_ophdr(
         struct hlist_head       rhash[],
         struct xlog_rec_header  *rhead,
         struct xlog_op_header   *ohead,
-       xfs_caddr_t             dp,
-       xfs_caddr_t             end,
+       char                    *dp,
+       char                    *end,
         int                     pass)
  {
         struct xlog_recover     *trans;
@@ -3661,11 +3670,11 @@ xlog_recover_process_data(
         struct xlog             *log,
         struct hlist_head       rhash[],
         struct xlog_rec_header  *rhead,
-       xfs_caddr_t             dp,
+       char                    *dp,
         int                     pass)
  {
         struct xlog_op_header   *ohead;
-       xfs_caddr_t             end;
+       char                    *end;
         int                     num_logops;
         int                     error;
  
@@ -3751,11 +3760,11 @@ xlog_recover_process_efi(
         }
  
         set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
         return error;
  
  abort_error:
-       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
         return error;
  }
  
@@ -3857,13 +3866,13 @@ xlog_recover_clear_agi_bucket(
         xfs_trans_log_buf(tp, agibp, offset,
                           (offset + sizeof(xfs_agino_t) - 1));
  
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto out_error;
         return;
  
  out_abort:
-       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
  out_error:
         xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
         return;
@@ -4010,7 +4019,7 @@ xlog_recover_process_iunlinks(
  STATIC int
  xlog_unpack_data_crc(
         struct xlog_rec_header  *rhead,
-       xfs_caddr_t             dp,
+       char                    *dp,
         struct xlog             *log)
  {
         __le32                  crc;
@@ -4040,7 +4049,7 @@ xlog_unpack_data_crc(
  STATIC int
  xlog_unpack_data(
         struct xlog_rec_header  *rhead,
-       xfs_caddr_t             dp,
+       char                    *dp,
         struct xlog             *log)
  {
         int                     i, j, k;
@@ -4122,7 +4131,7 @@ xlog_do_recovery_pass(
  {
         xlog_rec_header_t       *rhead;
         xfs_daddr_t             blk_no;
-       xfs_caddr_t             offset;
+       char                    *offset;
         xfs_buf_t               *hbp, *dbp;
         int                     error = 0, h_size;
         int                     bblks, split_bblks;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c

index 6f23fbd..461e791 100644 (file)
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -724,6 +724,22 @@ xfs_mountfs(
                         mp->m_inode_cluster_size = new_size;
         }
  
+       /*
+        * If enabled, sparse inode chunk alignment is expected to match the
+        * cluster size. Full inode chunk alignment must match the chunk size,
+        * but that is checked on sb read verification...
+        */
+       if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+           mp->m_sb.sb_spino_align !=
+                       XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+               xfs_warn(mp,
+       "Sparse inode block alignment (%u) must match cluster size (%llu).",
+                        mp->m_sb.sb_spino_align,
+                        XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+               error = -EINVAL;
+               goto out_remove_uuid;
+       }
+
         /*
          * Set inode alignment fields
          */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index 8c995a2..7999e91 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
         __uint64_t              m_flags;        /* global mount flags */
         int                     m_ialloc_inos;  /* inodes in inode allocation */
         int                     m_ialloc_blks;  /* blocks in inode allocation */
+       int                     m_ialloc_min_blks;/* min blocks in sparse inode
+                                                  * allocation */
         int                     m_inoalign_mask;/* mask sb_inoalignmt if used */
         uint                    m_qflags;       /* quota status flags */
         struct xfs_trans_resv   m_resv;         /* precomputed res values */
@@ -179,6 +181,8 @@ typedef struct xfs_mount {
                                                    allocator */
  #define XFS_MOUNT_NOATTR2      (1ULL << 25)    /* disable use of attr2 format */
  
+#define XFS_MOUNT_DAX          (1ULL << 62)    /* TEST ONLY! */
+
  
  /*
   * Default minimum read and write sizes.
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c

index 981a657..ab4a606 100644 (file)
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -306,7 +306,7 @@ xfs_fs_commit_blocks(
         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 goto out_drop_iolock;
         }
  
@@ -321,7 +321,7 @@ xfs_fs_commit_blocks(
         }
  
         xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
  
  out_drop_iolock:
         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c

index 5538468..eac9549 100644 (file)
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -756,7 +756,7 @@ xfs_qm_qino_alloc(
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
                                   XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -764,8 +764,7 @@ xfs_qm_qino_alloc(
                 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
                                                                 &committed);
                 if (error) {
-                       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                        XFS_TRANS_ABORT);
+                       xfs_trans_cancel(tp);
                         return error;
                 }
         }
@@ -796,7 +795,7 @@ xfs_qm_qino_alloc(
         spin_unlock(&mp->m_sb_lock);
         xfs_log_sb(tp);
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error) {
                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
                 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c

index 9a25c92..3640c6e 100644 (file)
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
         tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                 goto out_put;
         }
@@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
  
         error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
         if (error) {
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                    XFS_TRANS_ABORT);
+               xfs_trans_cancel(tp);
                 goto out_unlock;
         }
  
         ASSERT(ip->i_d.di_nextents == 0);
  
         xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
  
  out_unlock:
         xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -437,7 +436,7 @@ xfs_qm_scall_setqlim(
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 goto out_rele;
         }
  
@@ -548,7 +547,7 @@ xfs_qm_scall_setqlim(
         dqp->dq_flags |= XFS_DQ_DIRTY;
         xfs_trans_log_dquot(tp, dqp);
  
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
  
  out_rele:
         xfs_qm_dqrele(dqp);
@@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end(
  
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end(
          * We don't care about quotoff's performance.
          */
         xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
-       return error;
+       return xfs_trans_commit(tp);
  }
  
  
@@ -605,7 +603,7 @@ xfs_qm_log_quotaoff(
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 goto out;
         }
  
@@ -624,7 +622,7 @@ xfs_qm_log_quotaoff(
          * We don't care about quotoff's performance.
          */
         xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto out;
  
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h

index 5376dd4..ce6506a 100644 (file)
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -55,7 +55,6 @@ struct xfs_trans;
  typedef struct xfs_dqtrx {
         struct xfs_dquot *qt_dquot;       /* the dquot this refers to */
         ulong           qt_blk_res;       /* blks reserved on a dquot */
-       ulong           qt_blk_res_used;  /* blks used from the reservation */
         ulong           qt_ino_res;       /* inode reserved on a dquot */
         ulong           qt_ino_res_used;  /* inodes used from the reservation */
         long            qt_bcount_delta;  /* dquot blk count changes */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c

index f2079b6..f4e8c06 100644 (file)
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,7 +780,6 @@ xfs_growfs_rt_alloc(
          * Allocate space to the file, as necessary.
          */
         while (oblocks < nblocks) {
-               int             cancelflags = 0;
                 xfs_trans_t     *tp;
  
                 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
@@ -792,7 +791,6 @@ xfs_growfs_rt_alloc(
                                           resblks, 0);
                 if (error)
                         goto error_cancel;
-               cancelflags = XFS_TRANS_RELEASE_LOG_RES;
                 /*
                  * Lock the inode.
                  */
@@ -804,7 +802,6 @@ xfs_growfs_rt_alloc(
                  * Allocate blocks to the bitmap file.
                  */
                 nmap = 1;
-               cancelflags |= XFS_TRANS_ABORT;
                 error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
                                         XFS_BMAPI_METADATA, &firstblock,
                                         resblks, &map, &nmap, &flist);
@@ -818,14 +815,13 @@ xfs_growfs_rt_alloc(
                 error = xfs_bmap_finish(&tp, &flist, &committed);
                 if (error)
                         goto error_cancel;
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                 if (error)
                         goto error;
                 /*
                  * Now we need to clear the allocated blocks.
                  * Do this one block per transaction, to keep it simple.
                  */
-               cancelflags = 0;
                 for (bno = map.br_startoff, fsbno = map.br_startblock;
                      bno < map.br_startoff + map.br_blockcount;
                      bno++, fsbno++) {
@@ -851,7 +847,7 @@ xfs_growfs_rt_alloc(
                         if (bp == NULL) {
                                 error = -EIO;
  error_cancel:
-                               xfs_trans_cancel(tp, cancelflags);
+                               xfs_trans_cancel(tp);
                                 goto error;
                         }
                         memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
@@ -859,7 +855,7 @@ error_cancel:
                         /*
                          * Commit the transaction.
                          */
-                       error = xfs_trans_commit(tp, 0);
+                       error = xfs_trans_commit(tp);
                         if (error)
                                 goto error;
                 }
@@ -973,7 +969,6 @@ xfs_growfs_rt(
              bmbno < nrbmblocks;
              bmbno++) {
                 xfs_trans_t     *tp;
-               int             cancelflags = 0;
  
                 *nmp = *mp;
                 nsbp = &nmp->m_sb;
@@ -1015,7 +1010,6 @@ xfs_growfs_rt(
                 mp->m_rbmip->i_d.di_size =
                         nsbp->sb_rbmblocks * nsbp->sb_blocksize;
                 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-               cancelflags |= XFS_TRANS_ABORT;
                 /*
                  * Get the summary inode into the transaction.
                  */
@@ -1062,7 +1056,7 @@ xfs_growfs_rt(
                         nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
                 if (error) {
  error_cancel:
-                       xfs_trans_cancel(tp, cancelflags);
+                       xfs_trans_cancel(tp);
                         break;
                 }
                 /*
@@ -1076,7 +1070,7 @@ error_cancel:
                 mp->m_rsumlevels = nrsumlevels;
                 mp->m_rsumsize = nrsumsize;
  
-               error = xfs_trans_commit(tp, 0);
+               error = xfs_trans_commit(tp);
                 if (error)
                         break;
         }
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index 858e1e6..1fb1656 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj;        /* global debug sysfs attrs */
  #define MNTOPT_DISCARD    "discard"    /* Discard unused blocks */
  #define MNTOPT_NODISCARD   "nodiscard" /* Do not discard unused blocks */
  
+#define MNTOPT_DAX     "dax"           /* Enable direct access to bdev pages */
+
  /*
   * Table driven mount option parser.
   *
@@ -363,6 +365,10 @@ xfs_parseargs(
                         mp->m_flags |= XFS_MOUNT_DISCARD;
                 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
                         mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+               } else if (!strcmp(this_char, MNTOPT_DAX)) {
+                       mp->m_flags |= XFS_MOUNT_DAX;
+#endif
                 } else {
                         xfs_warn(mp, "unknown mount option [%s].", this_char);
                         return -EINVAL;
@@ -452,8 +458,8 @@ done:
  }
  
  struct proc_xfs_info {
-       int     flag;
-       char    *str;
+       uint64_t        flag;
+       char            *str;
  };
  
  STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
                 { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
                 { XFS_MOUNT_DISCARD,            "," MNTOPT_DISCARD },
                 { XFS_MOUNT_SMALL_INUMS,        "," MNTOPT_32BITINODE },
+               { XFS_MOUNT_DAX,                "," MNTOPT_DAX },
                 { 0, NULL }
         };
         static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
         if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
                 sb->s_flags |= MS_I_VERSION;
  
+       if (mp->m_flags & XFS_MOUNT_DAX) {
+               xfs_warn(mp,
+       "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+               if (sb->s_blocksize != PAGE_SIZE) {
+                       xfs_alert(mp,
+               "Filesystem block size invalid for DAX Turning DAX off.");
+                       mp->m_flags &= ~XFS_MOUNT_DAX;
+               } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+                       xfs_alert(mp,
+               "Block device does not support DAX Turning DAX off.");
+                       mp->m_flags &= ~XFS_MOUNT_DAX;
+               }
+       }
+
         error = xfs_mountfs(mp);
         if (error)
                 goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c

index 3df411e..4be27b0 100644 (file)
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -104,7 +104,7 @@ xfs_readlink_bmap(
                         cur_chunk += sizeof(struct xfs_dsymlink_hdr);
                 }
  
-               memcpy(link + offset, bp->b_addr, byte_cnt);
+               memcpy(link + offset, cur_chunk, byte_cnt);
  
                 pathlen -= byte_cnt;
                 offset += byte_cnt;
@@ -178,7 +178,6 @@ xfs_symlink(
         struct xfs_bmap_free    free_list;
         xfs_fsblock_t           first_block;
         bool                    unlock_dp_on_error = false;
-       uint                    cancel_flags;
         int                     committed;
         xfs_fileoff_t           first_fsb;
         xfs_filblks_t           fs_blocks;
@@ -224,7 +223,6 @@ xfs_symlink(
                 return error;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
         /*
          * The symlink will fit into the inode data fork?
          * There can't be any attributes so we get the whole variable part.
@@ -239,10 +237,8 @@ xfs_symlink(
                 resblks = 0;
                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
         }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                 goto out_trans_cancel;
-       }
  
         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
         unlock_dp_on_error = true;
@@ -394,7 +390,7 @@ xfs_symlink(
         if (error)
                 goto out_bmap_cancel;
  
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error)
                 goto out_release_inode;
  
@@ -407,9 +403,8 @@ xfs_symlink(
  
  out_bmap_cancel:
         xfs_bmap_cancel(&free_list);
-       cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
  out_release_inode:
         /*
          * Wait until after the current transaction is aborted to finish the
@@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt(
         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
         if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                 return error;
         }
  
@@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt(
         /*
          * Commit the transaction containing extent freeing and EFDs.
          */
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
         if (error) {
                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
                 goto error_unlock;
@@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt(
  error_bmap_cancel:
         xfs_bmap_cancel(&free_list);
  error_trans_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
  error_unlock:
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index 615781b..8d916d3 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
                   __entry->blocks, __entry->shift, __entry->writeio_blocks)
  )
  
+TRACE_EVENT(xfs_irec_merge_pre,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+                uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+       TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+               __field(uint16_t, holemask)
+               __field(xfs_agino_t, nagino)
+               __field(uint16_t, nholemask)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agino = agino;
+               __entry->holemask = holemask;
+               __entry->nagino = nagino;
+               __entry->nholemask = nholemask; /* mask of the "new" record */
+       ),
+       TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+                 __entry->agino, __entry->holemask, __entry->nagino,
+                 __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+                uint16_t holemask),
+       TP_ARGS(mp, agno, agino, holemask),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+               __field(uint16_t, holemask)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agino = agino;
+               __entry->holemask = holemask;
+       ),
+       TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+                 MINOR(__entry->dev), __entry->agno, __entry->agino,
+                 __entry->holemask)
+)
+
  #define DEFINE_IREF_EVENT(name) \
  DEFINE_EVENT(xfs_iref_class, name, \
         TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c

index 220ef2c..0582a27 100644 (file)
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -113,7 +113,7 @@ xfs_trans_free(
   * blocks.  Locks and log items, however, are not inherited.  They must
   * be added to the new transaction explicitly.
   */
-xfs_trans_t *
+STATIC xfs_trans_t *
  xfs_trans_dup(
         xfs_trans_t     *tp)
  {
@@ -251,14 +251,7 @@ xfs_trans_reserve(
          */
  undo_log:
         if (resp->tr_logres > 0) {
-               int             log_flags;
-
-               if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
-                       log_flags = XFS_LOG_REL_PERM_RESERV;
-               } else {
-                       log_flags = 0;
-               }
-               xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+               xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
                 tp->t_ticket = NULL;
                 tp->t_log_res = 0;
                 tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -744,7 +737,7 @@ void
  xfs_trans_free_items(
         struct xfs_trans        *tp,
         xfs_lsn_t               commit_lsn,
-       int                     flags)
+       bool                    abort)
  {
         struct xfs_log_item_desc *lidp, *next;
  
@@ -755,7 +748,7 @@ xfs_trans_free_items(
  
                 if (commit_lsn != NULLCOMMITLSN)
                         lip->li_ops->iop_committing(lip, commit_lsn);
-               if (flags & XFS_TRANS_ABORT)
+               if (abort)
                         lip->li_flags |= XFS_LI_ABORTED;
                 lip->li_ops->iop_unlock(lip);
  
@@ -892,26 +885,16 @@ xfs_trans_committed_bulk(
   * have already been unlocked as if the commit had succeeded.
   * Do not reference the transaction structure after this call.
   */
-int
-xfs_trans_commit(
+static int
+__xfs_trans_commit(
         struct xfs_trans        *tp,
-       uint                    flags)
+       bool                    regrant)
  {
         struct xfs_mount        *mp = tp->t_mountp;
         xfs_lsn_t               commit_lsn = -1;
         int                     error = 0;
-       int                     log_flags = 0;
         int                     sync = tp->t_flags & XFS_TRANS_SYNC;
  
-       /*
-        * Determine whether this commit is releasing a permanent
-        * log reservation or not.
-        */
-       if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-               ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-               log_flags = XFS_LOG_REL_PERM_RESERV;
-       }
-
         /*
          * If there is nothing to be logged by the transaction,
          * then unlock all of the items associated with the
@@ -936,7 +919,7 @@ xfs_trans_commit(
                 xfs_trans_apply_sb_deltas(tp);
         xfs_trans_apply_dquot_deltas(tp);
  
-       xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+       xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
  
         current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
         xfs_trans_free(tp);
@@ -964,18 +947,25 @@ out_unreserve:
          */
         xfs_trans_unreserve_and_mod_dquots(tp);
         if (tp->t_ticket) {
-               commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+               commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
                 if (commit_lsn == -1 && !error)
                         error = -EIO;
         }
         current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-       xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+       xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
         xfs_trans_free(tp);
  
         XFS_STATS_INC(xs_trans_empty);
         return error;
  }
  
+int
+xfs_trans_commit(
+       struct xfs_trans        *tp)
+{
+       return __xfs_trans_commit(tp, false);
+}
+
  /*
   * Unlock all of the transaction's items and free the transaction.
   * The transaction must not have modified any of its items, because
@@ -986,29 +976,22 @@ out_unreserve:
   */
  void
  xfs_trans_cancel(
-       xfs_trans_t             *tp,
-       int                     flags)
+       struct xfs_trans        *tp)
  {
-       int                     log_flags;
-       xfs_mount_t             *mp = tp->t_mountp;
+       struct xfs_mount        *mp = tp->t_mountp;
+       bool                    dirty = (tp->t_flags & XFS_TRANS_DIRTY);
  
-       /*
-        * See if the caller is being too lazy to figure out if
-        * the transaction really needs an abort.
-        */
-       if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
-               flags &= ~XFS_TRANS_ABORT;
         /*
          * See if the caller is relying on us to shut down the
          * filesystem.  This happens in paths where we detect
          * corruption and decide to give up.
          */
-       if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+       if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
                 XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
         }
  #ifdef DEBUG
-       if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+       if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
                 struct xfs_log_item_desc *lidp;
  
                 list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -1018,27 +1001,20 @@ xfs_trans_cancel(
         xfs_trans_unreserve_and_mod_sb(tp);
         xfs_trans_unreserve_and_mod_dquots(tp);
  
-       if (tp->t_ticket) {
-               if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-                       ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-                       log_flags = XFS_LOG_REL_PERM_RESERV;
-               } else {
-                       log_flags = 0;
-               }
-               xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-       }
+       if (tp->t_ticket)
+               xfs_log_done(mp, tp->t_ticket, NULL, false);
  
         /* mark this thread as no longer being in a transaction */
         current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
  
-       xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+       xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
         xfs_trans_free(tp);
  }
  
  /*
   * Roll from one trans in the sequence of PERMANENT transactions to
   * the next: permanent transactions are only flushed out when
- * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * committed with xfs_trans_commit(), but we still want as soon
   * as possible to let chunks of it go to the log. So we commit the
   * chunk we've been working on and get a new transaction to continue.
   */
@@ -1055,7 +1031,8 @@ xfs_trans_roll(
          * Ensure that the inode is always logged.
          */
         trans = *tpp;
-       xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+       if (dp)
+               xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
  
         /*
          * Copy the critical parameters from one trans to the next.
@@ -1071,19 +1048,12 @@ xfs_trans_roll(
          * is in progress. The caller takes the responsibility to cancel
          * the duplicate transaction that gets returned.
          */
-       error = xfs_trans_commit(trans, 0);
+       error = __xfs_trans_commit(trans, true);
         if (error)
                 return error;
  
         trans = *tpp;
  
-       /*
-        * transaction commit worked ok so we can drop the extra ticket
-        * reference that we gained in xfs_trans_dup()
-        */
-       xfs_log_ticket_put(trans->t_ticket);
-
-
         /*
          * Reserve space in the log for the next transaction.
          * This also pushes items in the "AIL", the list of logged items,
@@ -1100,6 +1070,7 @@ xfs_trans_roll(
         if (error)
                 return error;
  
-       xfs_trans_ijoin(trans, dp, 0);
+       if (dp)
+               xfs_trans_ijoin(trans, dp, 0);
         return 0;
  }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

index b5bc1ab..3b21b4e 100644 (file)
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
   * XFS transaction mechanism exported interfaces that are
   * actually macros.
   */
-#define        xfs_trans_get_log_res(tp)       ((tp)->t_log_res)
-#define        xfs_trans_get_log_count(tp)     ((tp)->t_log_count)
  #define        xfs_trans_get_block_res(tp)     ((tp)->t_blk_res)
  #define        xfs_trans_set_sync(tp)          ((tp)->t_flags |= XFS_TRANS_SYNC)
  
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
   */
  xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
  xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
  int            xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
                                   uint, uint);
  void           xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -228,9 +225,9 @@ void                xfs_trans_log_efd_extent(xfs_trans_t *,
                                          struct xfs_efd_log_item *,
                                          xfs_fsblock_t,
                                          xfs_extlen_t);
-int            xfs_trans_commit(xfs_trans_t *, uint flags);
+int            xfs_trans_commit(struct xfs_trans *);
  int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-void           xfs_trans_cancel(xfs_trans_t *, int);
+void           xfs_trans_cancel(xfs_trans_t *);
  int            xfs_trans_ail_init(struct xfs_mount *);
  void           xfs_trans_ail_destroy(struct xfs_mount *);
  
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c

index 573aefb..1098cf4 100644 (file)
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next(
  {
         struct xfs_log_item     *lip = cur->item;
  
-       if ((__psint_t)lip & 1)
+       if ((uintptr_t)lip & 1)
                 lip = xfs_ail_min(ailp);
         if (lip)
                 cur->item = xfs_ail_next(ailp, lip);
@@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear(
         list_for_each_entry(cur, &ailp->xa_cursors, list) {
                 if (cur->item == lip)
                         cur->item = (struct xfs_log_item *)
-                                       ((__psint_t)cur->item | 1);
+                                       ((uintptr_t)cur->item | 1);
         }
  }
  
@@ -287,7 +287,7 @@ xfs_ail_splice(
          * find the place in the AIL where the items belong.
          */
         lip = cur ? cur->item : NULL;
-       if (!lip || (__psint_t) lip & 1)
+       if (!lip || (uintptr_t)lip & 1)
                 lip = __xfs_trans_ail_cursor_last(ailp, lsn);
  
         /*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c

index 76a16df..ce78534 100644 (file)
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
         xfs_trans_t     *ntp)
  {
         xfs_dqtrx_t     *oq, *nq;
-       int             i,j;
+       int             i, j;
         xfs_dqtrx_t     *oqa, *nqa;
+       ulong           blk_res_used;
  
         if (!otp->t_dqinfo)
                 return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
          * Because the quota blk reservation is carried forward,
          * it is also necessary to carry forward the DQ_DIRTY flag.
          */
-       if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+       if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
                 ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
  
         for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
                 oqa = otp->t_dqinfo->dqs[j];
                 nqa = ntp->t_dqinfo->dqs[j];
                 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+                       blk_res_used = 0;
+
                         if (oqa[i].qt_dquot == NULL)
                                 break;
                         oq = &oqa[i];
                         nq = &nqa[i];
  
+                       if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+                               blk_res_used = oq->qt_bcount_delta;
+
                         nq->qt_dquot = oq->qt_dquot;
                         nq->qt_bcount_delta = nq->qt_icount_delta = 0;
                         nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
                         /*
                          * Transfer whatever is left of the reservations.
                          */
-                       nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
-                       oq->qt_blk_res = oq->qt_blk_res_used;
+                       nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+                       oq->qt_blk_res = blk_res_used;
  
                         nq->qt_rtblk_res = oq->qt_rtblk_res -
                                 oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
                  * disk blocks used.
                  */
               case XFS_TRANS_DQ_BCOUNT:
-               if (qtrx->qt_blk_res && delta > 0) {
-                       qtrx->qt_blk_res_used += (ulong)delta;
-                       ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
-               }
                 qtrx->qt_bcount_delta += delta;
                 break;
  
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
                          * reservation that a transaction structure knows of.
                          */
                         if (qtrx->qt_blk_res != 0) {
-                               if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
-                                       if (qtrx->qt_blk_res >
-                                           qtrx->qt_blk_res_used)
+                               ulong blk_res_used = 0;
+
+                               if (qtrx->qt_bcount_delta > 0)
+                                       blk_res_used = qtrx->qt_bcount_delta;
+
+                               if (qtrx->qt_blk_res != blk_res_used) {
+                                       if (qtrx->qt_blk_res > blk_res_used)
                                                 dqp->q_res_bcount -= (xfs_qcnt_t)
                                                         (qtrx->qt_blk_res -
-                                                        qtrx->qt_blk_res_used);
+                                                        blk_res_used);
                                         else
                                                 dqp->q_res_bcount -= (xfs_qcnt_t)
-                                                       (qtrx->qt_blk_res_used -
+                                                       (blk_res_used -
                                                          qtrx->qt_blk_res);
                                 }
                         } else {
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h

index bd12818..1b73629 100644 (file)
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -30,7 +30,7 @@ void  xfs_trans_init(struct xfs_mount *);
  void   xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
  void   xfs_trans_del_item(struct xfs_log_item *);
  void   xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
-                               int flags);
+                               bool abort);
  void   xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
  
  void   xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
diff --git a/include/linux/fs.h b/include/linux/fs.h

index e351da4..3f1a846 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
  typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                         ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
  
  #define MAY_EXEC               0x00000001
  #define MAY_WRITE              0x00000002
@@ -2655,9 +2656,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
  int dax_clear_blocks(struct inode *, sector_t block, long size);
  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
  int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
  
  #ifdef CONFIG_BLOCK
  typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
Documentation/filesystems/xfs.txt		patch \| blob \| history
fs/dax.c		patch \| blob \| history
fs/ext2/file.c		patch \| blob \| history
fs/ext4/file.c		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.h		patch \| blob \| history
fs/xfs/libxfs/xfs_attr.c		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_format.h		patch \| blob \| history
fs/xfs/libxfs/xfs_fs.h		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc.h		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc_btree.c		patch \| blob \| history
fs/xfs/libxfs/xfs_ialloc_btree.h		patch \| blob \| history
fs/xfs/libxfs/xfs_inode_buf.c		patch \| blob \| history
fs/xfs/libxfs/xfs_sb.c		patch \| blob \| history
fs/xfs/libxfs/xfs_shared.h		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_resv.h		patch \| blob \| history
fs/xfs/libxfs/xfs_trans_space.h		patch \| blob \| history
fs/xfs/xfs_aops.c		patch \| blob \| history
fs/xfs/xfs_aops.h		patch \| blob \| history
fs/xfs/xfs_attr_inactive.c		patch \| blob \| history
fs/xfs/xfs_bmap_util.c		patch \| blob \| history
fs/xfs/xfs_buf.c		patch \| blob \| history
fs/xfs/xfs_buf.h		patch \| blob \| history
fs/xfs/xfs_dquot.c		patch \| blob \| history
fs/xfs/xfs_error.c		patch \| blob \| history
fs/xfs/xfs_error.h		patch \| blob \| history
fs/xfs/xfs_extfree_item.c		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_filestream.c		patch \| blob \| history
fs/xfs/xfs_fsops.c		patch \| blob \| history
fs/xfs/xfs_inode.c		patch \| blob \| history
fs/xfs/xfs_ioctl.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_iops.c		patch \| blob \| history
fs/xfs/xfs_itable.c		patch \| blob \| history
fs/xfs/xfs_linux.h		patch \| blob \| history
fs/xfs/xfs_log.c		patch \| blob \| history
fs/xfs/xfs_log.h		patch \| blob \| history
fs/xfs/xfs_log_cil.c		patch \| blob \| history
fs/xfs/xfs_log_priv.h		patch \| blob \| history
fs/xfs/xfs_log_recover.c		patch \| blob \| history
fs/xfs/xfs_mount.c		patch \| blob \| history
fs/xfs/xfs_mount.h		patch \| blob \| history
fs/xfs/xfs_pnfs.c		patch \| blob \| history
fs/xfs/xfs_qm.c		patch \| blob \| history
fs/xfs/xfs_qm_syscalls.c		patch \| blob \| history
fs/xfs/xfs_quota.h		patch \| blob \| history
fs/xfs/xfs_rtalloc.c		patch \| blob \| history
fs/xfs/xfs_super.c		patch \| blob \| history
fs/xfs/xfs_symlink.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history
fs/xfs/xfs_trans.c		patch \| blob \| history
fs/xfs/xfs_trans.h		patch \| blob \| history
fs/xfs/xfs_trans_ail.c		patch \| blob \| history
fs/xfs/xfs_trans_dquot.c		patch \| blob \| history
fs/xfs/xfs_trans_priv.h		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history