Merge tag 'xfs-for-linus-4.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Jul 2015 03:16:08 +0000 (20:16 -0700)
Pull xfs updates from Dave Chinner:
 "There's a couple of small API changes to the core DAX code which
  required small changes to the ext2 and ext4 code bases, but otherwise
  everything is within the XFS codebase.

  This update contains:

   - A new sparse on-disk inode record format to allow small extents to
     be used for inode allocation when free space is fragmented.

   - DAX support.  This includes minor changes to the DAX core code to
     fix problems with lock ordering and bufferhead mapping abuse.

   - transaction commit interface cleanup

   - removal of various unnecessary XFS specific type definitions

   - cleanup and optimisation of freelist preparation before allocation

   - various minor cleanups

   - bug fixes for
        - transaction reservation leaks
        - incorrect inode logging in unwritten extent conversion
        - mmap lock vs freeze ordering
        - remote symlink mishandling
        - attribute fork removal issues"

* tag 'xfs-for-linus-4.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (49 commits)
  xfs: don't truncate attribute extents if no extents exist
  xfs: clean up XFS_MIN_FREELIST macros
  xfs: sanitise error handling in xfs_alloc_fix_freelist
  xfs: factor out free space extent length check
  xfs: xfs_alloc_fix_freelist() can use incore perag structures
  xfs: remove xfs_caddr_t
  xfs: use void pointers in log validation helpers
  xfs: return a void pointer from xfs_buf_offset
  xfs: remove inst_t
  xfs: remove __psint_t and __psunsigned_t
  xfs: fix remote symlinks on V5/CRC filesystems
  xfs: fix xfs_log_done interface
  xfs: saner xfs_trans_commit interface
  xfs: remove the flags argument to xfs_trans_cancel
  xfs: pass a boolean flag to xfs_trans_free_items
  xfs: switch remaining xfs_trans_dup users to xfs_trans_roll
  xfs: check min blks for random debug mode sparse allocations
  xfs: fix sparse inodes 32-bit compile failure
  xfs: add initial DAX support
  xfs: add DAX IO path support
  ...

60 files changed:
Documentation/filesystems/xfs.txt
fs/dax.c
fs/ext2/file.c
fs/ext4/file.c
fs/ext4/inode.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc.h
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_fs.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_ialloc.h
fs/xfs/libxfs/xfs_ialloc_btree.c
fs/xfs/libxfs/xfs_ialloc_btree.h
fs/xfs/libxfs/xfs_inode_buf.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_shared.h
fs/xfs/libxfs/xfs_trans_resv.h
fs/xfs/libxfs/xfs_trans_space.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_dquot.c
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_file.c
fs/xfs/xfs_filestream.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_linux.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_quota.h
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_dquot.c
fs/xfs/xfs_trans_priv.h
include/linux/fs.h

index 5a5a055..8146e9f 100644 (file)
@@ -236,10 +236,10 @@ Removed Mount Options
 
   Name                         Removed
   ----                         -------
-  delaylog/nodelaylog          v3.20
-  ihashsize                    v3.20
-  irixsgid                     v3.20
-  osyncisdsync/osyncisosync    v3.20
+  delaylog/nodelaylog          v4.0
+  ihashsize                    v4.0
+  irixsgid                     v4.0
+  osyncisdsync/osyncisosync    v4.0
 
 
 sysctls
@@ -346,5 +346,5 @@ Removed Sysctls
 
   Name                         Removed
   ----                         -------
-  fs.xfs.xfsbufd_centisec      v3.20
-  fs.xfs.age_buffer_centisecs  v3.20
+  fs.xfs.xfsbufd_centisec      v4.0
+  fs.xfs.age_buffer_centisecs  v4.0
index 6f65f00..99b5fbc 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  out:
        i_mmap_unlock_read(mapping);
 
-       if (bh->b_end_io)
-               bh->b_end_io(bh, 1);
-
        return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       get_block_t get_block, dax_iodone_t complete_unwritten)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                page_cache_release(page);
        }
 
+       /*
+        * If we successfully insert the new mapping over an unwritten extent,
+        * we need to ensure we convert the unwritten extent. If there is an
+        * error inserting the mapping, the filesystem needs to leave it as
+        * unwritten to prevent exposure of the stale underlying data to
+        * userspace, but we still need to call the completion function so
+        * the private resources on the mapping buffer can be released. We
+        * indicate what the callback should do via the uptodate variable, same
+        * as for normal BH based IO completions.
+        */
        error = dax_insert_mapping(inode, &bh, vma, vmf);
+       if (buffer_unwritten(&bh))
+               complete_unwritten(&bh, !error);
 
  out:
        if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        }
        goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
+             get_block_t get_block, dax_iodone_t complete_unwritten)
 {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = do_dax_fault(vma, vmf, get_block);
+       result = __dax_fault(vma, vmf, get_block, complete_unwritten);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
index 3a0a6c6..3b57c9f 100644 (file)
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext2_get_block);
+       return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext2_get_block);
+       return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
index ac517f1..bc313ac 100644 (file)
@@ -192,15 +192,27 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+       struct inode *inode = bh->b_assoc_map->host;
+       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+       int err;
+       if (!uptodate)
+               return;
+       WARN_ON(!buffer_unwritten(bh));
+       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext4_get_block);
+       return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
                                        /* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext4_get_block);
+       return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
index f8a8d4e..41f8e55 100644 (file)
@@ -656,18 +656,6 @@ has_zeroout:
        return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-       struct inode *inode = bh->b_assoc_map->host;
-       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-       int err;
-       if (!uptodate)
-               return;
-       WARN_ON(!buffer_unwritten(bh));
-       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-               if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+               if (IS_DAX(inode) && buffer_unwritten(bh)) {
+                       /*
+                        * dgc: I suspect unwritten conversion on ext4+DAX is
+                        * fundamentally broken here when there are concurrent
+                        * read/write in progress on this inode.
+                        */
+                       WARN_ON_ONCE(io_end);
                        bh->b_assoc_map = inode->i_mapping;
                        bh->b_private = (void *)(unsigned long)iblock;
-                       bh->b_end_io = ext4_end_io_unwritten;
                }
                if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                        set_buffer_defer_completion(bh);
index 516162b..f9e9ffe 100644 (file)
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
 {
        xfs_agblock_t   bno;
        xfs_extlen_t    len;
+       xfs_extlen_t    diff;
 
        /* Trim busy sections out of found extent */
        xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
 
+       /*
+        * If we have a largish extent that happens to start before min_agbno,
+        * see if we can shift it into range...
+        */
+       if (bno < args->min_agbno && bno + len > args->min_agbno) {
+               diff = args->min_agbno - bno;
+               if (len > diff) {
+                       bno += diff;
+                       len -= diff;
+               }
+       }
+
        if (args->alignment > 1 && len >= args->minlen) {
                xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
-               xfs_extlen_t    diff = aligned_bno - bno;
+
+               diff = aligned_bno - bno;
 
                *resbno = aligned_bno;
                *reslen = diff >= len ? 0 : len - diff;
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
                 * The good extent is closer than this one.
                 */
                if (!dir) {
+                       if (*sbnoa > args->max_agbno)
+                               goto out_use_good;
                        if (*sbnoa >= args->agbno + gdiff)
                                goto out_use_good;
                } else {
+                       if (*sbnoa < args->min_agbno)
+                               goto out_use_good;
                        if (*sbnoa <= args->agbno - gdiff)
                                goto out_use_good;
                }
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
        dofirst = prandom_u32() & 1;
 #endif
 
+       /* handle uninitialized agbno range so caller doesn't have to */
+       if (!args->min_agbno && !args->max_agbno)
+               args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+       ASSERT(args->min_agbno <= args->max_agbno);
+
+       /* clamp agbno to the range if it's outside */
+       if (args->agbno < args->min_agbno)
+               args->agbno = args->min_agbno;
+       if (args->agbno > args->max_agbno)
+               args->agbno = args->max_agbno;
+
 restart:
        bno_cur_lt = NULL;
        bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
                                                  &ltbnoa, &ltlena);
                        if (ltlena < args->minlen)
                                continue;
+                       if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+                               continue;
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
                                                  &ltbnoa, &ltlena);
-                       if (ltlena >= args->minlen)
+                       if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
                                break;
                        if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
                                goto error0;
-                       if (!i) {
+                       if (!i || ltbnoa < args->min_agbno) {
                                xfs_btree_del_cursor(bno_cur_lt,
                                                     XFS_BTREE_NOERROR);
                                bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        xfs_alloc_compute_aligned(args, gtbno, gtlen,
                                                  &gtbnoa, &gtlena);
-                       if (gtlena >= args->minlen)
+                       if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
                                break;
                        if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
                                goto error0;
-                       if (!i) {
+                       if (!i || gtbnoa > args->max_agbno) {
                                xfs_btree_del_cursor(bno_cur_gt,
                                                     XFS_BTREE_NOERROR);
                                bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
        ASSERT(ltnew >= ltbno);
        ASSERT(ltnew + rlen <= ltbnoa + ltlena);
        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+       ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
        args->agbno = ltnew;
 
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels(
 xfs_extlen_t
 xfs_alloc_longest_free_extent(
        struct xfs_mount        *mp,
-       struct xfs_perag        *pag)
+       struct xfs_perag        *pag,
+       xfs_extlen_t            need)
 {
-       xfs_extlen_t            need, delta = 0;
+       xfs_extlen_t            delta = 0;
 
-       need = XFS_MIN_FREELIST_PAG(pag, mp);
        if (need > pag->pagf_flcount)
                delta = need - pag->pagf_flcount;
 
@@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent(
        return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
 }
 
+unsigned int
+xfs_alloc_min_freelist(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag)
+{
+       unsigned int            min_free;
+
+       /* space needed by-bno freespace btree */
+       min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+                                      mp->m_ag_maxlevels);
+       /* space needed by-size freespace btree */
+       min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+                                      mp->m_ag_maxlevels);
+
+       return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+       struct xfs_alloc_arg    *args,
+       xfs_extlen_t            min_free,
+       int                     flags)
+{
+       struct xfs_perag        *pag = args->pag;
+       xfs_extlen_t            longest;
+       int                     available;
+
+       if (flags & XFS_ALLOC_FLAG_FREEING)
+               return true;
+
+       /* do we have enough contiguous free space for the allocation? */
+       longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+       if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+               return false;
+
+       /* do we have enough free space remaining for the allocation? */
+       available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+                         min_free - args->total);
+       if (available < (int)args->minleft)
+               return false;
+
+       return true;
+}
+
 /*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
  */
 STATIC int                     /* error */
 xfs_alloc_fix_freelist(
-       xfs_alloc_arg_t *args,  /* allocation argument structure */
-       int             flags)  /* XFS_ALLOC_FLAG_... */
+       struct xfs_alloc_arg    *args,  /* allocation argument structure */
+       int                     flags)  /* XFS_ALLOC_FLAG_... */
 {
-       xfs_buf_t       *agbp;  /* agf buffer pointer */
-       xfs_agf_t       *agf;   /* a.g. freespace structure pointer */
-       xfs_buf_t       *agflbp;/* agfl buffer pointer */
-       xfs_agblock_t   bno;    /* freelist block */
-       xfs_extlen_t    delta;  /* new blocks needed in freelist */
-       int             error;  /* error result code */
-       xfs_extlen_t    longest;/* longest extent in allocation group */
-       xfs_mount_t     *mp;    /* file system mount point structure */
-       xfs_extlen_t    need;   /* total blocks needed in freelist */
-       xfs_perag_t     *pag;   /* per-ag information structure */
-       xfs_alloc_arg_t targs;  /* local allocation arguments */
-       xfs_trans_t     *tp;    /* transaction pointer */
-
-       mp = args->mp;
+       struct xfs_mount        *mp = args->mp;
+       struct xfs_perag        *pag = args->pag;
+       struct xfs_trans        *tp = args->tp;
+       struct xfs_buf          *agbp = NULL;
+       struct xfs_buf          *agflbp = NULL;
+       struct xfs_alloc_arg    targs;  /* local allocation arguments */
+       xfs_agblock_t           bno;    /* freelist block */
+       xfs_extlen_t            need;   /* total blocks needed in freelist */
+       int                     error;
 
-       pag = args->pag;
-       tp = args->tp;
        if (!pag->pagf_init) {
-               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-                               &agbp)))
-                       return error;
+               error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+               if (error)
+                       goto out_no_agbp;
                if (!pag->pagf_init) {
                        ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
                        ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-                       args->agbp = NULL;
-                       return 0;
+                       goto out_agbp_relse;
                }
-       } else
-               agbp = NULL;
+       }
 
        /*
-        * If this is a metadata preferred pag and we are user data
-        * then try somewhere else if we are not being asked to
-        * try harder at this point
+        * If this is a metadata preferred pag and we are user data then try
+        * somewhere else if we are not being asked to try harder at this
+        * point
         */
        if (pag->pagf_metadata && args->userdata &&
            (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
                ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-               args->agbp = NULL;
-               return 0;
+               goto out_agbp_relse;
        }
 
-       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               /*
-                * If it looks like there isn't a long enough extent, or enough
-                * total blocks, reject it.
-                */
-               need = XFS_MIN_FREELIST_PAG(pag, mp);
-               longest = xfs_alloc_longest_free_extent(mp, pag);
-               if ((args->minlen + args->alignment + args->minalignslop - 1) >
-                               longest ||
-                   ((int)(pag->pagf_freeblks + pag->pagf_flcount -
-                          need - args->total) < (int)args->minleft)) {
-                       if (agbp)
-                               xfs_trans_brelse(tp, agbp);
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
+       need = xfs_alloc_min_freelist(mp, pag);
+       if (!xfs_alloc_space_available(args, need, flags))
+               goto out_agbp_relse;
 
        /*
         * Get the a.g. freespace buffer.
         * Can fail if we're not blocking on locks, and it's held.
         */
-       if (agbp == NULL) {
-               if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
-                               &agbp)))
-                       return error;
-               if (agbp == NULL) {
+       if (!agbp) {
+               error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+               if (error)
+                       goto out_no_agbp;
+               if (!agbp) {
                        ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
                        ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
-                       args->agbp = NULL;
-                       return 0;
-               }
-       }
-       /*
-        * Figure out how many blocks we should have in the freelist.
-        */
-       agf = XFS_BUF_TO_AGF(agbp);
-       need = XFS_MIN_FREELIST(agf, mp);
-       /*
-        * If there isn't enough total or single-extent, reject it.
-        */
-       if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               delta = need > be32_to_cpu(agf->agf_flcount) ?
-                       (need - be32_to_cpu(agf->agf_flcount)) : 0;
-               longest = be32_to_cpu(agf->agf_longest);
-               longest = (longest > delta) ? (longest - delta) :
-                       (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
-               if ((args->minlen + args->alignment + args->minalignslop - 1) >
-                               longest ||
-                   ((int)(be32_to_cpu(agf->agf_freeblks) +
-                    be32_to_cpu(agf->agf_flcount) - need - args->total) <
-                               (int)args->minleft)) {
-                       xfs_trans_brelse(tp, agbp);
-                       args->agbp = NULL;
-                       return 0;
+                       goto out_no_agbp;
                }
        }
+
+       /* If there isn't enough total space or single-extent, reject it. */
+       need = xfs_alloc_min_freelist(mp, pag);
+       if (!xfs_alloc_space_available(args, need, flags))
+               goto out_agbp_relse;
+
        /*
         * Make the freelist shorter if it's too long.
+        *
+        * Note that from this point onwards, we will always release the agf and
+        * agfl buffers on error. This handles the case where we error out and
+        * the buffers are clean or may not have been joined to the transaction
+        * and hence need to be released manually. If they have been joined to
+        * the transaction, then xfs_trans_brelse() will handle them
+        * appropriately based on the recursion count and dirty state of the
+        * buffer.
+        *
+        * XXX (dgc): When we have lots of free space, does this buy us
+        * anything other than extra overhead when we need to put more blocks
+        * back on the free list? Maybe we should only do this when space is
+        * getting low or the AGFL is more than half full?
         */
-       while (be32_to_cpu(agf->agf_flcount) > need) {
-               xfs_buf_t       *bp;
+       while (pag->pagf_flcount > need) {
+               struct xfs_buf  *bp;
 
                error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
                if (error)
-                       return error;
-               if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
-                       return error;
+                       goto out_agbp_relse;
+               error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+               if (error)
+                       goto out_agbp_relse;
                bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
                xfs_trans_binval(tp, bp);
        }
-       /*
-        * Initialize the args structure.
-        */
+
        memset(&targs, 0, sizeof(targs));
        targs.tp = tp;
        targs.mp = mp;
@@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist(
        targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
        targs.type = XFS_ALLOCTYPE_THIS_AG;
        targs.pag = pag;
-       if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
-               return error;
-       /*
-        * Make the freelist longer if it's too short.
-        */
-       while (be32_to_cpu(agf->agf_flcount) < need) {
+       error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+       if (error)
+               goto out_agbp_relse;
+
+       /* Make the freelist longer if it's too short. */
+       while (pag->pagf_flcount < need) {
                targs.agbno = 0;
-               targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
-               /*
-                * Allocate as many blocks as possible at once.
-                */
-               if ((error = xfs_alloc_ag_vextent(&targs))) {
-                       xfs_trans_brelse(tp, agflbp);
-                       return error;
-               }
+               targs.maxlen = need - pag->pagf_flcount;
+
+               /* Allocate as many blocks as possible at once. */
+               error = xfs_alloc_ag_vextent(&targs);
+               if (error)
+                       goto out_agflbp_relse;
+
                /*
                 * Stop if we run out.  Won't happen if callers are obeying
                 * the restrictions correctly.  Can happen for free calls
@@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist(
                if (targs.agbno == NULLAGBLOCK) {
                        if (flags & XFS_ALLOC_FLAG_FREEING)
                                break;
-                       xfs_trans_brelse(tp, agflbp);
-                       args->agbp = NULL;
-                       return 0;
+                       goto out_agflbp_relse;
                }
                /*
                 * Put each allocated block on the list.
@@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist(
                        error = xfs_alloc_put_freelist(tp, agbp,
                                                        agflbp, bno, 0);
                        if (error)
-                               return error;
+                               goto out_agflbp_relse;
                }
        }
        xfs_trans_brelse(tp, agflbp);
        args->agbp = agbp;
        return 0;
+
+out_agflbp_relse:
+       xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+       if (agbp)
+               xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+       args->agbp = NULL;
+       return error;
 }
 
 /*
index d1b4b6a..ca1c816 100644 (file)
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
        xfs_extlen_t    total;          /* total blocks needed in xaction */
        xfs_extlen_t    alignment;      /* align answer to multiple of this */
        xfs_extlen_t    minalignslop;   /* slop for minlen+alignment calcs */
+       xfs_agblock_t   min_agbno;      /* set an agbno range for NEAR allocs */
+       xfs_agblock_t   max_agbno;      /* ... */
        xfs_extlen_t    len;            /* output: actual size of extent */
        xfs_alloctype_t type;           /* allocation type XFS_ALLOCTYPE_... */
        xfs_alloctype_t otype;          /* original allocation type */
@@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg {
 #define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
 
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+               struct xfs_perag *pag, xfs_extlen_t need);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
                struct xfs_perag *pag);
 
 /*
index 0a472fb..3349c9a 100644 (file)
@@ -266,7 +266,7 @@ xfs_attr_set(
        tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
        error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
        if (error) {
-               xfs_trans_cancel(args.trans, 0);
+               xfs_trans_cancel(args.trans);
                return error;
        }
        xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +276,7 @@ xfs_attr_set(
                                       XFS_QMOPT_RES_REGBLKS);
        if (error) {
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_trans_cancel(args.trans);
                return error;
        }
 
@@ -320,8 +320,7 @@ xfs_attr_set(
                                xfs_trans_ichgtime(args.trans, dp,
                                                        XFS_ICHGTIME_CHG);
                        }
-                       err2 = xfs_trans_commit(args.trans,
-                                                XFS_TRANS_RELEASE_LOG_RES);
+                       err2 = xfs_trans_commit(args.trans);
                        xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
                        return error ? error : err2;
@@ -383,16 +382,14 @@ xfs_attr_set(
         * Commit the last in the sequence of transactions.
         */
        xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(args.trans);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
        return error;
 
 out:
-       if (args.trans) {
-               xfs_trans_cancel(args.trans,
-                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       }
+       if (args.trans)
+               xfs_trans_cancel(args.trans);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
        return error;
 }
@@ -462,7 +459,7 @@ xfs_attr_remove(
        error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
                                  XFS_ATTRRM_SPACE_RES(mp), 0);
        if (error) {
-               xfs_trans_cancel(args.trans, 0);
+               xfs_trans_cancel(args.trans);
                return error;
        }
 
@@ -501,16 +498,14 @@ xfs_attr_remove(
         * Commit the last in the sequence of transactions.
         */
        xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-       error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(args.trans);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
        return error;
 
 out:
-       if (args.trans) {
-               xfs_trans_cancel(args.trans,
-                       XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       }
+       if (args.trans)
+               xfs_trans_cancel(args.trans);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
        return error;
 }
index f1026e8..63e05b6 100644 (file)
@@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork(
        int                     committed;      /* xaction was committed */
        int                     logflags;       /* logging flags */
        int                     error;          /* error return value */
-       int                     cancel_flags = 0;
 
        ASSERT(XFS_IFORK_Q(ip) == 0);
 
@@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork(
                tp->t_flags |= XFS_TRANS_RESERVE;
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                        XFS_QMOPT_RES_REGBLKS);
        if (error)
                goto trans_cancel;
-       cancel_flags |= XFS_TRANS_ABORT;
        if (XFS_IFORK_Q(ip))
                goto trans_cancel;
        if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork(
        error = xfs_bmap_finish(&tp, &flist, &committed);
        if (error)
                goto bmap_cancel;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 
 bmap_cancel:
        xfs_bmap_cancel(&flist);
 trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
@@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent(
                }
        }
 
-       longest = xfs_alloc_longest_free_extent(mp, pag);
+       longest = xfs_alloc_longest_free_extent(mp, pag,
+                                       xfs_alloc_min_freelist(mp, pag));
        if (*blen < longest)
                *blen = longest;
 
@@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten(
        error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
                        &bma->cur, mval, bma->firstblock, bma->flist,
                        &tmp_logflags);
-       bma->logflags |= tmp_logflags;
+       /*
+        * Log the inode core unconditionally in the unwritten extent conversion
+        * path because the conversion might not have done so (e.g., if the
+        * extent count hasn't changed). We need to make sure the inode is dirty
+        * in the transaction for the sake of fsync(), even if nothing has
+        * changed, because fsync() will not force the log for this transaction
+        * unless it sees the inode pinned.
+        */
+       bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
        if (error)
                return error;
 
@@ -5918,7 +5924,7 @@ xfs_bmap_split_extent(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                        XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -5936,10 +5942,9 @@ xfs_bmap_split_extent(
        if (error)
                goto out;
 
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
+       return xfs_trans_commit(tp);
 
 out:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        return error;
 }
index 4daaa66..a0ae572 100644 (file)
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
        __uint32_t      sb_features_log_incompat;
 
        __uint32_t      sb_crc;         /* superblock crc */
-       __uint32_t      sb_pad;
+       xfs_extlen_t    sb_spino_align; /* sparse inode chunk alignment */
 
        xfs_ino_t       sb_pquotino;    /* project quota inode */
        xfs_lsn_t       sb_lsn;         /* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
        __be32          sb_features_log_incompat;
 
        __le32          sb_crc;         /* superblock crc */
-       __be32          sb_pad;
+       __be32          sb_spino_align; /* sparse inode chunk alignment */
 
        __be64          sb_pquotino;    /* project quota inode */
        __be64          sb_lsn;         /* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
 }
 
 #define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES  (1 << 1)        /* sparse inode chunks */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
-               (XFS_SB_FEAT_INCOMPAT_FTYPE)
+               (XFS_SB_FEAT_INCOMPAT_FTYPE|    \
+                XFS_SB_FEAT_INCOMPAT_SPINODES)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
                (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
 }
 
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+               xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
 /*
  * end of superblock version macros
  */
@@ -758,19 +766,6 @@ typedef struct xfs_agfl {
 
 #define XFS_AGFL_CRC_OFF       offsetof(struct xfs_agfl, agfl_crc)
 
-
-#define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
-#define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
-       (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define        XFS_MIN_FREELIST(a,mp)          \
-       (XFS_MIN_FREELIST_RAW(          \
-               be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
-               be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define        XFS_MIN_FREELIST_PAG(pag,mp)    \
-       (XFS_MIN_FREELIST_RAW(          \
-               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
 #define XFS_AGB_TO_FSB(mp,agno,agbno)  \
        (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
 #define        XFS_FSB_TO_AGNO(mp,fsbno)       \
@@ -1216,26 +1211,54 @@ typedef __uint64_t      xfs_inofree_t;
 #define        XFS_INOBT_ALL_FREE              ((xfs_inofree_t)-1)
 #define        XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
 
+#define XFS_INOBT_HOLEMASK_FULL                0       /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS                (NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT    \
+       (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
        return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
 }
 
 /*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes with the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
  */
 typedef struct xfs_inobt_rec {
        __be32          ir_startino;    /* starting inode number */
-       __be32          ir_freecount;   /* count of free inodes (set bits) */
+       union {
+               struct {
+                       __be32  ir_freecount;   /* count of free inodes */
+               } f;
+               struct {
+                       __be16  ir_holemask;/* hole mask for sparse chunks */
+                       __u8    ir_count;       /* total inode count */
+                       __u8    ir_freecount;   /* count of free inodes */
+               } sp;
+       } ir_u;
        __be64          ir_free;        /* free inode mask */
 } xfs_inobt_rec_t;
 
 typedef struct xfs_inobt_rec_incore {
        xfs_agino_t     ir_startino;    /* starting inode number */
-       __int32_t       ir_freecount;   /* count of free inodes (set bits) */
+       __uint16_t      ir_holemask;    /* hole mask for sparse chunks */
+       __uint8_t       ir_count;       /* total inode count */
+       __uint8_t       ir_freecount;   /* count of free inodes (set bits) */
        xfs_inofree_t   ir_free;        /* free inode mask */
 } xfs_inobt_rec_incore_t;
 
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+       /* non-zero holemask represents a sparse rec. */
+       return holemask;
+}
 
 /*
  * Key structure
@@ -1453,8 +1476,8 @@ struct xfs_acl {
                sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE           (unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT                (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE           "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT                "SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE      (sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE   (sizeof(SGI_ACL_DEFAULT)-1)
 
index 18dc721..89689c6 100644 (file)
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_V5SB       0x8000  /* version 5 superblock */
 #define XFS_FSOP_GEOM_FLAGS_FTYPE      0x10000 /* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT     0x20000 /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES   0x40000 /* sparse inode chunks  */
 
 /*
  * Minimum and maximum sizes need for growth checks.
index 1c9e755..66efc70 100644 (file)
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
        int                     *stat)  /* success/failure */
 {
        cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_holemask = 0;
+       cur->bc_rec.i.ir_count = 0;
        cur->bc_rec.i.ir_freecount = 0;
        cur->bc_rec.i.ir_free = 0;
        return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
        union xfs_btree_rec     rec;
 
        rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
-       rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+       if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+               rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+               rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+               rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+       } else {
+               /* ir_holemask/ir_count not supported on-disk */
+               rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+       }
        rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
        return xfs_btree_update(cur, &rec);
 }
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
        int                     error;
 
        error = xfs_btree_get_rec(cur, &rec, stat);
-       if (!error && *stat == 1) {
-               irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
-               irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
-               irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+       if (error || *stat == 0)
+               return error;
+
+       irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+       if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+               irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+               irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+               irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+       } else {
+               /*
+                * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+                * values for full inode chunks.
+                */
+               irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+               irec->ir_count = XFS_INODES_PER_CHUNK;
+               irec->ir_freecount =
+                               be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
        }
-       return error;
+       irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+       return 0;
 }
 
 /*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
 STATIC int
 xfs_inobt_insert_rec(
        struct xfs_btree_cur    *cur,
+       __uint16_t              holemask,
+       __uint8_t               count,
        __int32_t               freecount,
        xfs_inofree_t           free,
        int                     *stat)
 {
+       cur->bc_rec.i.ir_holemask = holemask;
+       cur->bc_rec.i.ir_count = count;
        cur->bc_rec.i.ir_freecount = freecount;
        cur->bc_rec.i.ir_free = free;
        return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
                }
                ASSERT(i == 0);
 
-               error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+               error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+                                            XFS_INODES_PER_CHUNK,
+                                            XFS_INODES_PER_CHUNK,
                                             XFS_INOBT_ALL_FREE, &i);
                if (error) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        struct list_head        *buffer_list,
+       int                     icount,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_agblock_t           length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
                 * they track in the AIL as if they were physically logged.
                 */
                if (tp)
-                       xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+                       xfs_icreate_log(tp, agno, agbno, icount,
                                        mp->m_sb.sb_inodesize, length, gen);
        } else
                version = 2;
@@ -346,6 +377,214 @@ xfs_ialloc_inode_init(
        return 0;
 }
 
+/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+       struct xfs_mount                *mp,
+       xfs_agino_t                     *startino,
+       uint16_t                        *allocmask)
+{
+       xfs_agblock_t                   agbno;
+       xfs_agblock_t                   mod;
+       int                             offset;
+
+       agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+       mod = agbno % mp->m_sb.sb_inoalignmt;
+       if (!mod)
+               return;
+
+       /* calculate the inode offset and align startino */
+       offset = mod << mp->m_sb.sb_inopblog;
+       *startino -= offset;
+
+       /*
+        * Since startino has been aligned down, left shift allocmask such that
+        * it continues to represent the same physical inodes relative to the
+        * new startino.
+        */
+       *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+       struct xfs_inobt_rec_incore     *trec,  /* tgt record */
+       struct xfs_inobt_rec_incore     *srec)  /* src record */
+{
+       uint64_t                        talloc;
+       uint64_t                        salloc;
+
+       /* records must cover the same inode range */
+       if (trec->ir_startino != srec->ir_startino)
+               return false;
+
+       /* both records must be sparse */
+       if (!xfs_inobt_issparse(trec->ir_holemask) ||
+           !xfs_inobt_issparse(srec->ir_holemask))
+               return false;
+
+       /* both records must track some inodes */
+       if (!trec->ir_count || !srec->ir_count)
+               return false;
+
+       /* can't exceed capacity of a full record */
+       if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+               return false;
+
+       /* verify there is no allocation overlap */
+       talloc = xfs_inobt_irec_to_allocmask(trec);
+       salloc = xfs_inobt_irec_to_allocmask(srec);
+       if (talloc & salloc)
+               return false;
+
+       return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+       struct xfs_inobt_rec_incore     *trec,  /* target */
+       struct xfs_inobt_rec_incore     *srec)  /* src */
+{
+       ASSERT(trec->ir_startino == srec->ir_startino);
+
+       /* combine the counts */
+       trec->ir_count += srec->ir_count;
+       trec->ir_freecount += srec->ir_freecount;
+
+       /*
+        * Merge the holemask and free mask. For both fields, 0 bits refer to
+        * allocated inodes. We combine the allocated ranges with bitwise AND.
+        */
+       trec->ir_holemask &= srec->ir_holemask;
+       trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+       struct xfs_mount                *mp,
+       struct xfs_trans                *tp,
+       struct xfs_buf                  *agbp,
+       int                             btnum,
+       struct xfs_inobt_rec_incore     *nrec,  /* in/out: new/merged rec. */
+       bool                            merge)  /* merge or replace */
+{
+       struct xfs_btree_cur            *cur;
+       struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+       xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+       int                             error;
+       int                             i;
+       struct xfs_inobt_rec_incore     rec;
+
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+       /* the new record is pre-aligned so we know where to look */
+       error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+       if (error)
+               goto error;
+       /* if nothing there, insert a new record and return */
+       if (i == 0) {
+               error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+                                            nrec->ir_count, nrec->ir_freecount,
+                                            nrec->ir_free, &i);
+               if (error)
+                       goto error;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+               goto out;
+       }
+
+       /*
+        * A record exists at this startino. Merge or replace the record
+        * depending on what we've been asked to do.
+        */
+       if (merge) {
+               error = xfs_inobt_get_rec(cur, &rec, &i);
+               if (error)
+                       goto error;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+               XFS_WANT_CORRUPTED_GOTO(mp,
+                                       rec.ir_startino == nrec->ir_startino,
+                                       error);
+
+               /*
+                * This should never fail. If we have coexisting records that
+                * cannot merge, something is seriously wrong.
+                */
+               XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+                                       error);
+
+               trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+                                        rec.ir_holemask, nrec->ir_startino,
+                                        nrec->ir_holemask);
+
+               /* merge to nrec to output the updated record */
+               __xfs_inobt_rec_merge(nrec, &rec);
+
+               trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+                                         nrec->ir_holemask);
+
+               error = xfs_inobt_rec_check_count(mp, nrec);
+               if (error)
+                       goto error;
+       }
+
+       error = xfs_inobt_update(cur, nrec);
+       if (error)
+               goto error;
+
+out:
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       return 0;
+error:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+       return error;
+}
+
 /*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
        xfs_agino_t     newlen;         /* new number of inodes */
        int             isaligned = 0;  /* inode allocation at stripe unit */
                                        /* boundary */
+       uint16_t        allocmask = (uint16_t) -1; /* init. to full chunk */
+       struct xfs_inobt_rec_incore rec;
        struct xfs_perag *pag;
+       int             do_sparse = 0;
 
        memset(&args, 0, sizeof(args));
        args.tp = tp;
        args.mp = tp->t_mountp;
+       args.fsbno = NULLFSBLOCK;
+
+#ifdef DEBUG
+       /* randomly do sparse inode allocations */
+       if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+           args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+               do_sparse = prandom_u32() & 1;
+#endif
 
        /*
         * Locking will ensure that we don't have two callers in here
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
        agno = be32_to_cpu(agi->agi_seqno);
        args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
                     args.mp->m_ialloc_blks;
+       if (do_sparse)
+               goto sparse_alloc;
        if (likely(newino != NULLAGINO &&
                  (args.agbno < be32_to_cpu(agi->agi_length)))) {
                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
                 * subsequent requests.
                 */
                args.minalignslop = 0;
-       } else
-               args.fsbno = NULLFSBLOCK;
+       }
 
        if (unlikely(args.fsbno == NULLFSBLOCK)) {
                /*
@@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc(
                        return error;
        }
 
+       /*
+        * Finally, try a sparse allocation if the filesystem supports it and
+        * the sparse allocation length is smaller than a full chunk.
+        */
+       if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+           args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+           args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+               args.agbno = be32_to_cpu(agi->agi_root);
+               args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+               args.alignment = args.mp->m_sb.sb_spino_align;
+               args.prod = 1;
+
+               args.minlen = args.mp->m_ialloc_min_blks;
+               args.maxlen = args.minlen;
+
+               /*
+                * The inode record will be aligned to full chunk size. We must
+                * prevent sparse allocation from AG boundaries that result in
+                * invalid inode records, such as records that start at agbno 0
+                * or extend beyond the AG.
+                *
+                * Set min agbno to the first aligned, non-zero agbno and max to
+                * the last aligned agbno that is at least one full chunk from
+                * the end of the AG.
+                */
+               args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+               args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+                                           args.mp->m_sb.sb_inoalignmt) -
+                                args.mp->m_ialloc_blks;
+
+               error = xfs_alloc_vextent(&args);
+               if (error)
+                       return error;
+
+               newlen = args.len << args.mp->m_sb.sb_inopblog;
+               ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+               allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+       }
+
        if (args.fsbno == NULLFSBLOCK) {
                *alloc = 0;
                return 0;
@@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc(
         * rather than a linear progression to prevent the next generation
         * number from being easily guessable.
         */
-       error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
-                       args.len, prandom_u32());
+       error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+                       args.agbno, args.len, prandom_u32());
 
        if (error)
                return error;
@@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc(
         * Convert the results.
         */
        newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+       if (xfs_inobt_issparse(~allocmask)) {
+               /*
+                * We've allocated a sparse chunk. Align the startino and mask.
+                */
+               xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+               rec.ir_startino = newino;
+               rec.ir_holemask = ~allocmask;
+               rec.ir_count = newlen;
+               rec.ir_freecount = newlen;
+               rec.ir_free = XFS_INOBT_ALL_FREE;
+
+               /*
+                * Insert the sparse record into the inobt and allow for a merge
+                * if necessary. If a merge does occur, rec is updated to the
+                * merged record.
+                */
+               error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+                                              &rec, true);
+               if (error == -EFSCORRUPTED) {
+                       xfs_alert(args.mp,
+       "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+                                 XFS_AGINO_TO_INO(args.mp, agno,
+                                                  rec.ir_startino),
+                                 rec.ir_holemask, rec.ir_count);
+                       xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+               }
+               if (error)
+                       return error;
+
+               /*
+                * Unlike for the inobt, we can't merge just the part we've
+                * allocated due to finobt semantics. The original record may or
+                * may not exist independent of whether physical inodes exist in
+                * this sparse chunk.
+                *
+                * We must update the finobt record based on the inobt record.
+                * rec contains the fully merged and up to date inobt record
+                * from the previous call. Set merge false to replace any
+                * existing record with this one.
+                */
+               if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+                       error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+                                                      XFS_BTNUM_FINO, &rec,
+                                                      false);
+                       if (error)
+                               return error;
+               }
+       } else {
+               /* full chunk - insert new records to both btrees */
+               error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+                                        XFS_BTNUM_INO);
+               if (error)
+                       return error;
+
+               if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+                       error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+                                                newlen, XFS_BTNUM_FINO);
+                       if (error)
+                               return error;
+               }
+       }
+
+       /*
+        * Update AGI counts and newino.
+        */
        be32_add_cpu(&agi->agi_count, newlen);
        be32_add_cpu(&agi->agi_freecount, newlen);
        pag = xfs_perag_get(args.mp, agno);
@@ -511,20 +870,6 @@ xfs_ialloc_ag_alloc(
        xfs_perag_put(pag);
        agi->agi_newino = cpu_to_be32(newino);
 
-       /*
-        * Insert records describing the new inode chunk into the btrees.
-        */
-       error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-                                XFS_BTNUM_INO);
-       if (error)
-               return error;
-
-       if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-               error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-                                        XFS_BTNUM_FINO);
-               if (error)
-                       return error;
-       }
        /*
         * Log allocation group header fields
         */
@@ -645,7 +990,7 @@ xfs_ialloc_ag_select(
                 * if we fail allocation due to alignment issues then it is most
                 * likely a real ENOSPC condition.
                 */
-               ineed = mp->m_ialloc_blks;
+               ineed = mp->m_ialloc_min_blks;
                if (flags && ineed > 1)
                        ineed += xfs_ialloc_cluster_alignment(mp);
                longest = pag->pagf_longest;
@@ -731,6 +1076,27 @@ xfs_ialloc_get_rec(
        return 0;
 }
 
+/*
+ * Return the offset of the first free inode in the record, as located by
+ * xfs_lowbit64() in ir_free. For a sparse chunk, ir_free is first ANDed with
+ * the allocation mask built from ir_holemask so a hole is never returned.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+       struct xfs_inobt_rec_incore     *rec)
+{
+       xfs_inofree_t                   realfree;
+
+       /* if there are no holes, return the first available offset */
+       if (!xfs_inobt_issparse(rec->ir_holemask))
+               return xfs_lowbit64(rec->ir_free);
+
+       realfree = xfs_inobt_irec_to_allocmask(rec); /* physically allocated inodes */
+       realfree &= rec->ir_free; /* ...that are also marked free */
+
+       return xfs_lowbit64(realfree);
+}
+
 /*
  * Allocate an inode using the inobt-only algorithm.
  */
@@ -961,7 +1327,7 @@ newino:
        }
 
 alloc_inode:
-       offset = xfs_lowbit64(rec.ir_free);
+       offset = xfs_inobt_first_free_inode(&rec);
        ASSERT(offset >= 0);
        ASSERT(offset < XFS_INODES_PER_CHUNK);
        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1576,7 @@ xfs_dialloc_ag(
        if (error)
                goto error_cur;
 
-       offset = xfs_lowbit64(rec.ir_free);
+       offset = xfs_inobt_first_free_inode(&rec);
        ASSERT(offset >= 0);
        ASSERT(offset < XFS_INODES_PER_CHUNK);
        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1805,83 @@ out_error:
        return error;
 }
 
+/*
+ * Queue the blocks of an inode chunk on @flist via xfs_bmap_add_free(). The
+ * chunk might be sparse, in which case only the regions that are physically
+ * allocated (zero bits in ir_holemask) are queued, one extent per region.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+       struct xfs_mount                *mp,
+       xfs_agnumber_t                  agno,
+       struct xfs_inobt_rec_incore     *rec,
+       struct xfs_bmap_free            *flist)
+{
+       xfs_agblock_t   sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+       int             startidx, endidx; /* holemask bit range of the current run */
+       int             nextbit;
+       xfs_agblock_t   agbno;
+       int             contigblk; /* length of the current run in fs blocks */
+       DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+       if (!xfs_inobt_issparse(rec->ir_holemask)) {
+               /* not sparse: the whole chunk is one extent of m_ialloc_blks */
+               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+                                 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+                                 mp->m_ialloc_blks, flist, mp);
+               return;
+       }
+
+       /* holemask is only 16-bits (fits in an unsigned long) */
+       ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+       holemask[0] = rec->ir_holemask;
+
+       /*
+        * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+        * holemask and convert the start/end index of each range to an extent.
+        * We start with the start and end index both pointing at the first 0 in
+        * the mask.
+        */
+       startidx = endidx = find_first_zero_bit(holemask,
+                                               XFS_INOBT_HOLEMASK_BITS);
+       nextbit = startidx + 1;
+       while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+               nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+                                            nextbit);
+               /*
+                * If the next zero bit is contiguous, update the end index of
+                * the current range and continue.
+                */
+               if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+                   nextbit == endidx + 1) {
+                       endidx = nextbit;
+                       goto next;
+               }
+
+               /*
+                * nextbit is not contiguous with the current end index, or no
+                * zero bits remain. Convert the current start/end to an extent
+                * and add it to the free list.
+                */
+               agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+                                 mp->m_sb.sb_inopblock;
+               contigblk = ((endidx - startidx + 1) *
+                            XFS_INODES_PER_HOLEMASK_BIT) /
+                           mp->m_sb.sb_inopblock;
+
+               ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+               ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+                                 flist, mp);
+
+               /* restart the range at nextbit; the loop ends if it is past the mask */
+               startidx = endidx = nextbit;
+
+next:
+               nextbit++;
+       }
+}
+
 STATIC int
 xfs_difree_inobt(
        struct xfs_mount                *mp,
@@ -1446,8 +1889,7 @@ xfs_difree_inobt(
        struct xfs_buf                  *agbp,
        xfs_agino_t                     agino,
        struct xfs_bmap_free            *flist,
-       int                             *deleted,
-       xfs_ino_t                       *first_ino,
+       struct xfs_icluster             *xic,
        struct xfs_inobt_rec_incore     *orec)
 {
        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1943,23 @@ xfs_difree_inobt(
        rec.ir_freecount++;
 
        /*
-        * When an inode cluster is free, it becomes eligible for removal
+        * When an inode chunk is free, it becomes eligible for removal. Don't
+        * remove the chunk if the block size is large enough for multiple inode
+        * chunks (that might not be free).
         */
        if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
-           (rec.ir_freecount == mp->m_ialloc_inos)) {
-
-               *deleted = 1;
-               *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+           rec.ir_free == XFS_INOBT_ALL_FREE &&
+           mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+               xic->deleted = 1;
+               xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+               xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
 
                /*
                 * Remove the inode cluster from the AGI B+Tree, adjust the
                 * AGI and Superblock inode counts, and mark the disk space
                 * to be freed when the transaction is committed.
                 */
-               ilen = mp->m_ialloc_inos;
+               ilen = rec.ir_freecount;
                be32_add_cpu(&agi->agi_count, -ilen);
                be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1975,9 @@ xfs_difree_inobt(
                        goto error0;
                }
 
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-                                 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
-                                 mp->m_ialloc_blks, flist, mp);
+               xfs_difree_inode_chunk(mp, agno, &rec, flist);
        } else {
-               *deleted = 0;
+               xic->deleted = 0;
 
                error = xfs_inobt_update(cur, &rec);
                if (error) {
@@ -1599,7 +2042,9 @@ xfs_difree_finobt(
                 */
                XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
 
-               error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+               error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+                                            ibtrec->ir_count,
+                                            ibtrec->ir_freecount,
                                             ibtrec->ir_free, &i);
                if (error)
                        goto error;
@@ -1634,8 +2079,13 @@ xfs_difree_finobt(
         * free inode. Hence, if all of the inodes are free and we aren't
         * keeping inode chunks permanently on disk, remove the record.
         * Otherwise, update the record with the new information.
+        *
+        * Note that we currently can't free chunks when the block size is large
+        * enough for multiple chunks. Leave the finobt record to remain in sync
+        * with the inobt.
         */
-       if (rec.ir_freecount == mp->m_ialloc_inos &&
+       if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+           mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
                error = xfs_btree_delete(cur, &i);
                if (error)
@@ -1671,8 +2121,7 @@ xfs_difree(
        struct xfs_trans        *tp,            /* transaction pointer */
        xfs_ino_t               inode,          /* inode to be freed */
        struct xfs_bmap_free    *flist,         /* extents to free */
-       int                     *deleted,/* set if inode cluster was deleted */
-       xfs_ino_t               *first_ino)/* first inode in deleted cluster */
+       struct xfs_icluster     *xic)   /* cluster info if deleted */
 {
        /* REFERENCED */
        xfs_agblock_t           agbno;  /* block number containing inode */
@@ -1723,8 +2172,7 @@ xfs_difree(
        /*
         * Fix up the inode allocation btree.
         */
-       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
-                                &rec);
+       error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
        if (error)
                goto error0;
 
index 100007d..6e450df 100644 (file)
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
 /* Move inodes in clusters of this size */
 #define        XFS_INODE_BIG_CLUSTER_SIZE      8192
 
+struct xfs_icluster {
+       bool            deleted;        /* set if the inode chunk was removed */
+       xfs_ino_t       first_ino;      /* first inode of the removed chunk */
+       uint64_t        alloc;          /* inode phys. allocation bitmap of the
+                                        * removed chunk (from ir_holemask) */
+};
+
 /* Calculate and return the number of filesystem blocks per inode cluster */
 static inline int
 xfs_icluster_size_fsb(
@@ -44,8 +51,7 @@ xfs_icluster_size_fsb(
 static inline struct xfs_dinode *
 xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 {
-       return (struct xfs_dinode *)
-               (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+       return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
 }
 
 /*
@@ -90,8 +96,7 @@ xfs_difree(
        struct xfs_trans *tp,           /* transaction pointer */
        xfs_ino_t       inode,          /* inode to be freed */
        struct xfs_bmap_free *flist,    /* extents to free */
-       int             *deleted,       /* set if inode cluster was deleted */
-       xfs_ino_t       *first_ino);    /* first inode in deleted cluster */
+       struct xfs_icluster *ifree);    /* cluster info if deleted */
 
 /*
  * Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
  * Inode chunk initialisation routine
  */
 int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
-                         struct list_head *buffer_list,
+                         struct list_head *buffer_list, int icount,
                          xfs_agnumber_t agno, xfs_agblock_t agbno,
                          xfs_agblock_t length, unsigned int gen);
 
index 964c465..674ad8f 100644 (file)
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
        union xfs_btree_rec     *rec)
 {
        rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-       rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+               rec->inobt.ir_u.sp.ir_holemask =
+                                       cpu_to_be16(cur->bc_rec.i.ir_holemask);
+               rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+               rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+       } else {
+               /* ir_holemask/ir_count not supported on-disk */
+               rec->inobt.ir_u.f.ir_freecount =
+                                       cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       }
        rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
 }
 
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
                return blocklen / sizeof(xfs_inobt_rec_t);
        return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
 }
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is physically allocated, 0 means a hole.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+       struct xfs_inobt_rec_incore     *rec)
+{
+       uint64_t                        bitmap = 0;
+       uint64_t                        inodespbit;
+       int                             nextbit;
+       uint                            allocbitmap;
+
+       /*
+        * The holemask has 16-bits for a 64 inode record. Therefore each
+        * holemask bit represents multiple inodes. Create a mask of bits to set
+        * in the allocmask for each holemask bit.
+        */
+       inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+       /*
+        * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+        * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+        * anything beyond the 16 holemask bits since this casts to a larger
+        * type.
+        */
+       allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+       /*
+        * allocbitmap is the inverted holemask so every set bit represents
+        * allocated inodes. To expand from 16-bit holemask granularity to
+        * 64-bit (i.e., bit-per-inode), set inodespbit bits in the target
+        * bitmap for every holemask bit.
+        */
+       nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+       while (nextbit != -1) { /* stop once xfs_next_bit() reports -1 */
+               ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+               bitmap |= (inodespbit <<
+                          (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+               nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+       }
+
+       return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that ir_count matches the number of bits set in the record allocmask.
+ */
+int
+xfs_inobt_rec_check_count(
+       struct xfs_mount                *mp, /* not referenced by this function */
+       struct xfs_inobt_rec_incore     *rec)
+{
+       int                             inocount = 0;
+       int                             nextbit = 0;
+       uint64_t                        allocbmap;
+       int                             wordsz;
+
+       wordsz = sizeof(allocbmap) / sizeof(unsigned int); /* allocbmap size in uints */
+       allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+       nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+       while (nextbit != -1) {
+               inocount++;
+               nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+                                      nextbit + 1);
+       }
+
+       if (inocount != rec->ir_count)
+               return -EFSCORRUPTED; /* holemask and ir_count disagree */
+
+       return 0;
+}
+#endif /* DEBUG || XFS_WARN */
index d7ebea7..bd88453 100644 (file)
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
                xfs_btnum_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+                             struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec)     0
+#endif /* DEBUG */
+
 #endif /* __XFS_IALLOC_BTREE_H__ */
index 002b6b3..6526e76 100644 (file)
@@ -46,8 +46,7 @@ xfs_inobp_check(
        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 
        for (i = 0; i < j; i++) {
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       i * mp->m_sb.sb_inodesize);
+               dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
                if (!dip->di_next_unlinked)  {
                        xfs_alert(mp,
        "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
@@ -86,8 +85,7 @@ xfs_inode_buf_verify(
                int             di_ok;
                xfs_dinode_t    *dip;
 
-               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
+               dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
                            XFS_DINODE_GOOD_VERSION(dip->di_version);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
@@ -186,7 +184,7 @@ xfs_imap_to_bp(
        }
 
        *bpp = bp;
-       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+       *dipp = xfs_buf_offset(bp, imap->im_boffset);
        return 0;
 }
 
index dc4bfc5..df9851c 100644 (file)
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
                        return -EFSCORRUPTED;
        }
 
+       /*
+        * Full inode chunks must be aligned to inode chunk size when
+        * sparse inodes are enabled to support the sparse chunk
+        * allocation algorithm and prevent overlapping inode records.
+        */
+       if (xfs_sb_version_hassparseinodes(sbp)) {
+               uint32_t        align;
+
+               xfs_alert(mp,
+       "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
+               align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+                               >> sbp->sb_blocklog;
+               if (sbp->sb_inoalignmt != align) {
+                       xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+                                sbp->sb_inoalignmt, align);
+                       return -EINVAL;
+               }
+       }
+
        if (unlikely(
            sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
                xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
                                be32_to_cpu(from->sb_features_log_incompat);
        /* crc is only used on disk, not in memory; just init to 0 here. */
        to->sb_crc = 0;
-       to->sb_pad = 0;
+       to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
        to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
        to->sb_lsn = be64_to_cpu(from->sb_lsn);
        /* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
                                cpu_to_be32(from->sb_features_incompat);
                to->sb_features_log_incompat =
                                cpu_to_be32(from->sb_features_log_incompat);
-               to->sb_pad = 0;
+               to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
                to->sb_lsn = cpu_to_be64(from->sb_lsn);
        }
 }
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
        mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
                                        sbp->sb_inopblock);
        mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+       if (sbp->sb_spino_align)
+               mp->m_ialloc_min_blks = sbp->sb_spino_align;
+       else
+               mp->m_ialloc_min_blks = mp->m_ialloc_blks;
 }
 
 /*
@@ -792,12 +818,12 @@ xfs_sync_sb(
        tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
        xfs_log_sb(tp);
        if (wait)
                xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
 }
index 8dda4b3..5be5297 100644 (file)
@@ -181,12 +181,6 @@ int        xfs_log_calc_minimum_size(struct xfs_mount *);
 #define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
 #define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
                                           count in superblock */
-/*
- * Values for call flags parameter.
- */
-#define        XFS_TRANS_RELEASE_LOG_RES       0x4
-#define        XFS_TRANS_ABORT                 0x8
-
 /*
  * Field values for xfs_trans_mod_sb.
  */
index 2d5bdfc..7978150 100644 (file)
@@ -73,9 +73,9 @@ struct xfs_trans_resv {
  * 2 trees * (2 blocks/level * max depth - 1) * block size
  */
 #define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
-       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
 #define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+       ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
 
 /*
  * Per-directory log reservation for any directory change.
index bf9c457..41e0428 100644 (file)
@@ -67,7 +67,7 @@
 #define        XFS_DIOSTRAT_SPACE_RES(mp, v)   \
        (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
 #define        XFS_GROWFS_SPACE_RES(mp)        \
-       (2 * XFS_AG_MAXLEVELS(mp))
+       (2 * (mp)->m_ag_maxlevels)
 #define        XFS_GROWFSRT_SPACE_RES(mp,b)    \
        ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
 #define        XFS_LINK_SPACE_RES(mp,nl)       \
index e5099f2..3859f5e 100644 (file)
@@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc(
 
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -145,7 +145,7 @@ xfs_setfilesize(
        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return 0;
        }
 
@@ -155,7 +155,7 @@ xfs_setfilesize(
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
 }
 
 STATIC int
@@ -1348,7 +1348,7 @@ __xfs_get_blocks(
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create,
-       int                     direct)
+       bool                    direct)
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
@@ -1413,6 +1413,7 @@ __xfs_get_blocks(
                        if (error)
                                return error;
                        new = 1;
+
                } else {
                        /*
                         * Delalloc reservations do not require a transaction,
@@ -1507,49 +1508,29 @@ xfs_get_blocks(
        struct buffer_head      *bh_result,
        int                     create)
 {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
        struct inode            *inode,
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create)
 {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-       struct kiocb            *iocb,
+static void
+__xfs_end_io_direct_write(
+       struct inode            *inode,
+       struct xfs_ioend        *ioend,
        loff_t                  offset,
-       ssize_t                 size,
-       void                    *private)
+       ssize_t                 size)
 {
-       struct inode            *inode = file_inode(iocb->ki_filp);
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ioend        *ioend = private;
-
-       trace_xfs_gbmap_direct_endio(ip, offset, size,
-                                    ioend ? ioend->io_type : 0, NULL);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
 
-       if (!ioend) {
-               ASSERT(offset + size <= i_size_read(inode));
-               return;
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
                goto out_end_io;
 
        /*
@@ -1586,10 +1567,10 @@ xfs_end_io_direct_write(
         * here can result in EOF moving backwards and Bad Things Happen when
         * that occurs.
         */
-       spin_lock(&ip->i_flags_lock);
+       spin_lock(&XFS_I(inode)->i_flags_lock);
        if (offset + size > i_size_read(inode))
                i_size_write(inode, offset + size);
-       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&XFS_I(inode)->i_flags_lock);
 
        /*
         * If we are doing an append IO that needs to update the EOF on disk,
@@ -1606,6 +1587,98 @@ out_end_io:
        return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+       struct kiocb            *iocb,
+       loff_t                  offset,
+       ssize_t                 size,
+       void                    *private)
+{
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_ioend        *ioend = private; /* NULL: overwrite within EOF */
+
+       trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+                                    ioend ? ioend->io_type : 0, NULL);
+
+       if (!ioend) {
+               ASSERT(offset + size <= i_size_read(inode));
+               return;
+       }
+
+       __xfs_end_io_direct_write(inode, ioend, offset, size); /* shared with the DAX callback */
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so the size handed on to
+ * __xfs_end_io_direct_write() is trimmed back to EOF (io_size is not modified).
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+       struct buffer_head      *bh,
+       int                     uptodate)
+{
+       struct xfs_ioend        *ioend = bh->b_private;
+       struct inode            *inode = ioend->io_inode;
+       ssize_t                 size = ioend->io_size;
+
+       ASSERT(IS_DAX(ioend->io_inode));
+
+       /* zeroing failed: set io_error so the ioend is not converted */
+       if (!uptodate)
+               ioend->io_error = -EIO;
+
+       /*
+        * Trim update to EOF, so we don't extend EOF during unwritten extent
+        * conversion of partial EOF blocks.
+        */
+       spin_lock(&XFS_I(inode)->i_flags_lock);
+       if (ioend->io_offset + size > i_size_read(inode))
+               size = i_size_read(inode) - ioend->io_offset;
+       spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+       __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } /* no-op without CONFIG_FS_DAX */
+#endif /* CONFIG_FS_DAX */
+
+static inline ssize_t
+xfs_vm_do_dio(
+       struct inode            *inode,
+       struct kiocb            *iocb,
+       struct iov_iter         *iter,
+       loff_t                  offset,
+       void                    (*endio)(struct kiocb   *iocb,
+                                        loff_t         offset,
+                                        ssize_t        size,
+                                        void           *private),
+       int                     flags) /* only forwarded to __blockdev_direct_IO() */
+{ /* Issue direct I/O through dax_do_io() for DAX inodes, else __blockdev_direct_IO(). */
+       struct block_device     *bdev;
+
+       if (IS_DAX(inode)) /* returns before any block device lookup */
+               return dax_do_io(iocb, inode, iter, offset,
+                                xfs_get_blocks_direct, endio, 0); /* NOTE(review): literal 0, @flags not passed here - confirm intended */
+
+       bdev = xfs_find_bdev_for_inode(inode);
+       return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+                                    xfs_get_blocks_direct, endio, NULL, flags);
+}
+
 STATIC ssize_t
 xfs_vm_direct_IO(
        struct kiocb            *iocb,
@@ -1613,16 +1686,11 @@ xfs_vm_direct_IO(
        loff_t                  offset)
 {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
 
-       if (iov_iter_rw(iter) == WRITE) {
-               return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                           xfs_get_blocks_direct,
-                                           xfs_end_io_direct_write, NULL,
-                                           DIO_ASYNC_EXTEND);
-       }
-       return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                   xfs_get_blocks_direct, NULL, NULL, 0);
+       if (iov_iter_rw(iter) == WRITE)
+               return xfs_vm_do_dio(inode, iocb, iter, offset,
+                                    xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+       return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
 }
 
 /*
index ac644e0..86afd1a 100644 (file)
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int    xfs_get_blocks(struct inode *inode, sector_t offset,
+                      struct buffer_head *map_bh, int create);
+int    xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+                             struct buffer_head *map_bh, int create);
+void   xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 
index 3fbf167..2bb959a 100644 (file)
@@ -394,7 +394,6 @@ xfs_attr_inactive(
 {
        struct xfs_trans        *trans;
        struct xfs_mount        *mp;
-       int                     cancel_flags = 0;
        int                     lock_mode = XFS_ILOCK_SHARED;
        int                     error = 0;
 
@@ -423,7 +422,6 @@ xfs_attr_inactive(
                goto out_cancel;
 
        lock_mode = XFS_ILOCK_EXCL;
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT;
        xfs_ilock(dp, lock_mode);
 
        if (!XFS_IFORK_Q(dp))
@@ -435,8 +433,14 @@ xfs_attr_inactive(
         */
        xfs_trans_ijoin(trans, dp, 0);
 
-       /* invalidate and truncate the attribute fork extents */
-       if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+       /*
+        * Invalidate and truncate the attribute fork extents. Make sure the
+        * fork actually has attributes as otherwise the invalidation has no
+        * blocks to read and returns an error. In this case, just do the fork
+        * removal below.
+        */
+       if (xfs_inode_hasattr(dp) &&
+           dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
                error = xfs_attr3_root_inactive(&trans, dp);
                if (error)
                        goto out_cancel;
@@ -449,12 +453,12 @@ xfs_attr_inactive(
        /* Reset the attribute fork - this also destroys the in-core fork */
        xfs_attr_fork_remove(dp, trans);
 
-       error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(trans);
        xfs_iunlock(dp, lock_mode);
        return error;
 
 out_cancel:
-       xfs_trans_cancel(trans, cancel_flags);
+       xfs_trans_cancel(trans);
 out_destroy_fork:
        /* kill the in-core attr fork before we drop the inode lock */
        if (dp->i_afp)
index a52bbd3..0f34886 100644 (file)
@@ -75,28 +75,20 @@ xfs_bmap_finish(
        xfs_efi_log_item_t      *efi;           /* extent free intention */
        int                     error;          /* error return value */
        xfs_bmap_free_item_t    *free;          /* free extent item */
-       struct xfs_trans_res    tres;           /* new log reservation */
        xfs_mount_t             *mp;            /* filesystem mount structure */
        xfs_bmap_free_item_t    *next;          /* next item on free list */
-       xfs_trans_t             *ntp;           /* new transaction pointer */
 
        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
        if (flist->xbf_count == 0) {
                *committed = 0;
                return 0;
        }
-       ntp = *tp;
-       efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+       efi = xfs_trans_get_efi(*tp, flist->xbf_count);
        for (free = flist->xbf_first; free; free = free->xbfi_next)
-               xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+               xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
                        free->xbfi_blockcount);
 
-       tres.tr_logres = ntp->t_log_res;
-       tres.tr_logcount = ntp->t_log_count;
-       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-       ntp = xfs_trans_dup(*tp);
-       error = xfs_trans_commit(*tp, 0);
-       *tp = ntp;
+       error = xfs_trans_roll(tp, NULL);
        *committed = 1;
        /*
         * We have a new transaction, so we should return committed=1,
@@ -105,19 +97,10 @@ xfs_bmap_finish(
        if (error)
                return error;
 
-       /*
-        * transaction commit worked ok so we can drop the extra ticket
-        * reference that we gained in xfs_trans_dup()
-        */
-       xfs_log_ticket_put(ntp->t_ticket);
-
-       error = xfs_trans_reserve(ntp, &tres, 0, 0);
-       if (error)
-               return error;
-       efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+       efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
        for (free = flist->xbf_first; free != NULL; free = next) {
                next = free->xbfi_next;
-               if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+               if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
                                free->xbfi_blockcount))) {
                        /*
                         * The bmap free list will be cleaned up at a
@@ -127,7 +110,7 @@ xfs_bmap_finish(
                         * happens, since this transaction may not be
                         * dirty yet.
                         */
-                       mp = ntp->t_mountp;
+                       mp = (*tp)->t_mountp;
                        if (!XFS_FORCED_SHUTDOWN(mp))
                                xfs_force_shutdown(mp,
                                                   (error == -EFSCORRUPTED) ?
@@ -135,7 +118,7 @@ xfs_bmap_finish(
                                                   SHUTDOWN_META_IO_ERROR);
                        return error;
                }
-               xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+               xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
                        free->xbfi_blockcount);
                xfs_bmap_del_free(flist, NULL, free);
        }
@@ -878,7 +861,7 @@ xfs_free_eofblocks(
 
                if (need_iolock) {
                        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-                               xfs_trans_cancel(tp, 0);
+                               xfs_trans_cancel(tp);
                                return -EAGAIN;
                        }
                }
@@ -886,7 +869,7 @@ xfs_free_eofblocks(
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                        if (need_iolock)
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return error;
@@ -908,12 +891,9 @@ xfs_free_eofblocks(
                         * If we get an error at this point we simply don't
                         * bother truncating the file.
                         */
-                       xfs_trans_cancel(tp,
-                                        (XFS_TRANS_RELEASE_LOG_RES |
-                                         XFS_TRANS_ABORT));
+                       xfs_trans_cancel(tp);
                } else {
-                       error = xfs_trans_commit(tp,
-                                               XFS_TRANS_RELEASE_LOG_RES);
+                       error = xfs_trans_commit(tp);
                        if (!error)
                                xfs_inode_clear_eofblocks_tag(ip);
                }
@@ -1026,7 +1006,7 @@ xfs_alloc_file_space(
                         * Free the transaction structure.
                         */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1053,7 +1033,7 @@ xfs_alloc_file_space(
                        goto error0;
                }
 
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                if (error) {
                        break;
@@ -1077,7 +1057,7 @@ error0:   /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
 
 error1:        /* Just cancel transaction */
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
@@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes(
                        break;
                ASSERT(imap.br_blockcount >= 1);
                ASSERT(imap.br_startoff == offset_fsb);
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+               if (imap.br_startblock == HOLESTARTBLOCK ||
+                   imap.br_state == XFS_EXT_UNWRITTEN) {
+                       /* skip the entire extent */
+                       lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+                                                     imap.br_blockcount) - 1;
+                       continue;
+               }
+
                lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
                if (lastoffset > endoff)
                        lastoffset = endoff;
-               if (imap.br_startblock == HOLESTARTBLOCK)
-                       continue;
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+               /* DAX can just zero the backing device directly */
+               if (IS_DAX(VFS_I(ip))) {
+                       error = dax_zero_page_range(VFS_I(ip), offset,
+                                                   lastoffset - offset + 1,
+                                                   xfs_get_blocks_direct);
+                       if (error)
+                               return error;
                        continue;
+               }
 
                error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1284,7 @@ xfs_free_file_space(
                         * Free the transaction structure.
                         */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1315,7 @@ xfs_free_file_space(
                        goto error0;
                }
 
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
 
@@ -1330,7 +1325,7 @@ xfs_free_file_space(
  error0:
        xfs_bmap_cancel(&free_list);
  error1:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        goto out;
 }
@@ -1462,7 +1457,7 @@ xfs_shift_file_space(
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
                if (error) {
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                        break;
                }
 
@@ -1492,13 +1487,13 @@ xfs_shift_file_space(
                if (error)
                        goto out;
 
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
        }
 
        return error;
 
 out:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        return error;
 }
 
@@ -1718,7 +1713,7 @@ xfs_swap_extents(
        tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                goto out_unlock;
        }
 
@@ -1901,7 +1896,7 @@ xfs_swap_extents(
        if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);
 
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
 
        trace_xfs_swap_extent_after(ip, 0);
        trace_xfs_swap_extent_after(tip, 1);
@@ -1915,6 +1910,6 @@ out_unlock:
        goto out;
 
 out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
        goto out;
 }
index 1790b00..a4b7d92 100644 (file)
@@ -1419,9 +1419,9 @@ xfs_buf_submit_wait(
        return error;
 }
 
-xfs_caddr_t
+void *
 xfs_buf_offset(
-       xfs_buf_t               *bp,
+       struct xfs_buf          *bp,
        size_t                  offset)
 {
        struct page             *page;
@@ -1431,7 +1431,7 @@ xfs_buf_offset(
 
        offset += bp->b_offset;
        page = bp->b_pages[offset >> PAGE_SHIFT];
-       return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+       return page_address(page) + (offset & (PAGE_SIZE-1));
 }
 
 /*
index 75ff5d5..331c1cc 100644 (file)
@@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
            xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
 /* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
 
 /* Delayed Write Buffer Routines */
 extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
index 02c01bb..4143dc7 100644 (file)
@@ -568,8 +568,6 @@ xfs_qm_dqread(
        struct xfs_buf          *bp;
        struct xfs_trans        *tp = NULL;
        int                     error;
-       int                     cancelflags = 0;
-
 
        dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
 
@@ -617,7 +615,6 @@ xfs_qm_dqread(
                                          XFS_QM_DQALLOC_SPACE_RES(mp), 0);
                if (error)
                        goto error1;
-               cancelflags = XFS_TRANS_RELEASE_LOG_RES;
        }
 
        /*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
                 * allocate (ENOENT).
                 */
                trace_xfs_dqread_fail(dqp);
-               cancelflags |= XFS_TRANS_ABORT;
                goto error1;
        }
 
@@ -670,7 +666,7 @@ xfs_qm_dqread(
        xfs_trans_brelse(tp, bp);
 
        if (tp) {
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                if (error)
                        goto error0;
        }
@@ -680,7 +676,7 @@ xfs_qm_dqread(
 
 error1:
        if (tp)
-               xfs_trans_cancel(tp, cancelflags);
+               xfs_trans_cancel(tp);
 error0:
        xfs_qm_dqdestroy(dqp);
        *O_dqpp = NULL;
index 338e50b..74d0e59 100644 (file)
@@ -127,7 +127,7 @@ xfs_error_report(
        struct xfs_mount        *mp,
        const char              *filename,
        int                     linenum,
-       inst_t                  *ra)
+       void                    *ra)
 {
        if (level <= xfs_error_level) {
                xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
@@ -146,7 +146,7 @@ xfs_corruption_error(
        void                    *p,
        const char              *filename,
        int                     linenum,
-       inst_t                  *ra)
+       void                    *ra)
 {
        if (level <= xfs_error_level)
                xfs_hex_dump(p, 64);
index c0394ed..4ed3042 100644 (file)
 struct xfs_mount;
 
 extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
-                       const char *filename, int linenum, inst_t *ra);
+                       const char *filename, int linenum, void *ra);
 extern void xfs_corruption_error(const char *tag, int level,
                        struct xfs_mount *mp, void *p, const char *filename,
-                       int linenum, inst_t *ra);
+                       int linenum, void *ra);
 extern void xfs_verifier_error(struct xfs_buf *bp);
 
 #define        XFS_ERROR_REPORT(e, lvl, mp)    \
index cb7fe64..adc8f8f 100644 (file)
@@ -239,7 +239,7 @@ xfs_efi_init(
 
        xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
        efip->efi_format.efi_nextents = nextents;
-       efip->efi_format.efi_id = (__psint_t)(void*)efip;
+       efip->efi_format.efi_id = (uintptr_t)(void *)efip;
        atomic_set(&efip->efi_next_extent, 0);
        atomic_set(&efip->efi_refcount, 2);
 
index 7c62fca..874507d 100644 (file)
@@ -80,14 +80,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *     xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *     xfs_iozero clears the specified range of buffer supplied,
- *     and marks all the affected blocks as valid and modified.  If
- *     an affected block is not allocated, it will be allocated.  If
- *     an affected block is not completely overwritten, and is not
- *     valid before the operation, it will be read from disk before
- *     being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -97,7 +98,8 @@ xfs_iozero(
 {
        struct page             *page;
        struct address_space    *mapping;
-       int                     status;
+       int                     status = 0;
+
 
        mapping = VFS_I(ip)->i_mapping;
        do {
@@ -109,20 +111,27 @@ xfs_iozero(
                if (bytes > count)
                        bytes = count;
 
-               status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                       AOP_FLAG_UNINTERRUPTIBLE,
-                                       &page, &fsdata);
-               if (status)
-                       break;
+               if (IS_DAX(VFS_I(ip))) {
+                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+                                                    xfs_get_blocks_direct);
+                       if (status)
+                               break;
+               } else {
+                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                               AOP_FLAG_UNINTERRUPTIBLE,
+                                               &page, &fsdata);
+                       if (status)
+                               break;
 
-               zero_user(page, offset, bytes);
+                       zero_user(page, offset, bytes);
 
-               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-                                       page, fsdata);
-               WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = pagecache_write_end(NULL, mapping, pos, bytes,
+                                               bytes, page, fsdata);
+                       WARN_ON(status <= 0); /* can't return zero or less! */
+                       status = 0;
+               }
                pos += bytes;
                count -= bytes;
-               status = 0;
        } while (count);
 
        return status;
@@ -139,7 +148,7 @@ xfs_update_prealloc_flags(
        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -161,7 +170,7 @@ xfs_update_prealloc_flags(
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (flags & XFS_PREALLOC_SYNC)
                xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
 }
 
 /*
@@ -285,7 +294,7 @@ xfs_file_read_iter(
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
 
-       if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -379,7 +388,11 @@ xfs_file_splice_read(
 
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       /* for dax, we need to avoid the page cache */
+       if (IS_DAX(VFS_I(ip)))
+               ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+       else
+               ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -673,7 +686,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if ((pos | count) & target->bt_logical_sectormask)
+       if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -759,8 +772,11 @@ xfs_file_dio_aio_write(
 out:
        xfs_rw_iunlock(ip, iolock);
 
-       /* No fallback to buffered IO on errors for XFS. */
-       ASSERT(ret < 0 || ret == count);
+       /*
+        * No fallback to buffered IO on errors for XFS. DAX can result in
+        * partial writes, but direct IO will either complete fully or fail.
+        */
+       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
        return ret;
 }
 
@@ -843,7 +859,7 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1064,17 +1080,6 @@ xfs_file_readdir(
        return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-       struct file     *filp,
-       struct vm_area_struct *vma)
-{
-       vma->vm_ops = &xfs_file_vm_ops;
-
-       file_accessed(filp);
-       return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1455,48 +1460,83 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
 STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct inode            *inode = file_inode(vma->vm_file);
+       int                     ret;
 
-       trace_xfs_filemap_fault(ip);
+       trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
-       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = filemap_fault(vma, vmf);
-       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-       return error;
+       if (IS_DAX(inode)) {
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+                                   xfs_end_io_dax_write);
+       } else {
+               ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = block_page_mkwrite_return(ret);
+       }
+
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+
+       return ret;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct xfs_inode        *ip = XFS_I(file_inode(vma->vm_file));
+       int                     ret;
+
+       trace_xfs_filemap_fault(ip);
 
-       trace_xfs_filemap_page_mkwrite(ip);
+       /* DAX can shortcut the normal fault path on write faults! */
+       if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+               return xfs_filemap_page_mkwrite(vma, vmf);
 
        xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       ret = filemap_fault(vma, vmf);
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-       return error;
+       return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+       .fault          = xfs_filemap_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+       struct file     *filp,
+       struct vm_area_struct *vma)
+{
+       file_accessed(filp);
+       vma->vm_ops = &xfs_file_vm_ops;
+       if (IS_DAX(file_inode(filp)))
+               vma->vm_flags |= VM_MIXEDMAP;
+       return 0;
 }
 
 const struct file_operations xfs_file_operations = {
@@ -1527,9 +1567,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
        .fsync          = xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = xfs_filemap_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_filemap_page_mkwrite,
-};
index da82f1c..c4c130f 100644 (file)
@@ -196,7 +196,8 @@ xfs_filestream_pick_ag(
                        goto next_ag;
                }
 
-               longest = xfs_alloc_longest_free_extent(mp, pag);
+               longest = xfs_alloc_longest_free_extent(mp, pag,
+                                       xfs_alloc_min_freelist(mp, pag));
                if (((minlen && longest >= minlen) ||
                     (!minlen && pag->pagf_freeblks >= minfree)) &&
                    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
index cb7e8a2..9b3438a 100644 (file)
@@ -101,7 +101,9 @@ xfs_fs_geometry(
                        (xfs_sb_version_hasftype(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
                        (xfs_sb_version_hasfinobt(&mp->m_sb) ?
-                               XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+                               XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
+                       (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
+                               XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
                geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
                                mp->m_sb.sb_logsectsize : BBSIZE;
                geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
                                  XFS_GROWFS_SPACE_RES(mp), 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
        if (dpct)
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
        xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
        if (error)
                return error;
 
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
        return saved_error ? saved_error : error;
 
  error0:
-       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        return error;
 }
 
index 539a85f..3da9f4d 100644 (file)
@@ -905,7 +905,6 @@ xfs_dir_ialloc(
 
 {
        xfs_trans_t     *tp;
-       xfs_trans_t     *ntp;
        xfs_inode_t     *ip;
        xfs_buf_t       *ialloc_context = NULL;
        int             code;
@@ -954,8 +953,6 @@ xfs_dir_ialloc(
         * to succeed the second time.
         */
        if (ialloc_context) {
-               struct xfs_trans_res tres;
-
                /*
                 * Normally, xfs_trans_commit releases all the locks.
                 * We call bhold to hang on to the ialloc_context across
@@ -964,12 +961,6 @@ xfs_dir_ialloc(
                 * allocation group.
                 */
                xfs_trans_bhold(tp, ialloc_context);
-               /*
-                * Save the log reservation so we can use
-                * them in the next transaction.
-                */
-               tres.tr_logres = xfs_trans_get_log_res(tp);
-               tres.tr_logcount = xfs_trans_get_log_count(tp);
 
                /*
                 * We want the quota changes to be associated with the next
@@ -985,35 +976,9 @@ xfs_dir_ialloc(
                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                }
 
-               ntp = xfs_trans_dup(tp);
-               code = xfs_trans_commit(tp, 0);
-               tp = ntp;
-               if (committed != NULL) {
+               code = xfs_trans_roll(&tp, NULL);
+               if (committed != NULL)
                        *committed = 1;
-               }
-               /*
-                * If we get an error during the commit processing,
-                * release the buffer that is still held and return
-                * to the caller.
-                */
-               if (code) {
-                       xfs_buf_relse(ialloc_context);
-                       if (dqinfo) {
-                               tp->t_dqinfo = dqinfo;
-                               xfs_trans_free_dqinfo(tp);
-                       }
-                       *tpp = ntp;
-                       *ipp = NULL;
-                       return code;
-               }
-
-               /*
-                * transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
-                */
-               xfs_log_ticket_put(tp->t_ticket);
-               tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-               code = xfs_trans_reserve(tp, &tres, 0, 0);
 
                /*
                 * Re-attach the quota info that we detached from prev trx.
@@ -1025,7 +990,7 @@ xfs_dir_ialloc(
 
                if (code) {
                        xfs_buf_relse(ialloc_context);
-                       *tpp = ntp;
+                       *tpp = tp;
                        *ipp = NULL;
                        return code;
                }
@@ -1127,7 +1092,6 @@ xfs_create(
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        bool                    unlock_dp_on_error = false;
-       uint                    cancel_flags;
        int                     committed;
        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
@@ -1164,8 +1128,6 @@ xfs_create(
                tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        }
 
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
@@ -1183,10 +1145,9 @@ xfs_create(
                resblks = 0;
                error = xfs_trans_reserve(tp, tres, 0, 0);
        }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                goto out_trans_cancel;
-       }
+
 
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
@@ -1217,7 +1178,7 @@ xfs_create(
        if (error) {
                if (error == -ENOSPC)
                        goto out_trans_cancel;
-               goto out_trans_abort;
+               goto out_trans_cancel;
        }
 
        /*
@@ -1235,7 +1196,7 @@ xfs_create(
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != -ENOSPC);
-               goto out_trans_abort;
+               goto out_trans_cancel;
        }
        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1269,7 +1230,7 @@ xfs_create(
        if (error)
                goto out_bmap_cancel;
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;
 
@@ -1282,10 +1243,8 @@ xfs_create(
 
  out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
- out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
  out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
@@ -1317,7 +1276,6 @@ xfs_create_tmpfile(
        struct xfs_inode        *ip = NULL;
        struct xfs_trans        *tp = NULL;
        int                     error;
-       uint                    cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        prid_t                  prid;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *gdqp = NULL;
@@ -1350,10 +1308,8 @@ xfs_create_tmpfile(
                resblks = 0;
                error = xfs_trans_reserve(tp, tres, 0, 0);
        }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                goto out_trans_cancel;
-       }
 
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
@@ -1365,7 +1321,7 @@ xfs_create_tmpfile(
        if (error) {
                if (error == -ENOSPC)
                        goto out_trans_cancel;
-               goto out_trans_abort;
+               goto out_trans_cancel;
        }
 
        if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -1381,9 +1337,9 @@ xfs_create_tmpfile(
        ip->i_d.di_nlink--;
        error = xfs_iunlink(tp, ip);
        if (error)
-               goto out_trans_abort;
+               goto out_trans_cancel;
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;
 
@@ -1394,10 +1350,8 @@ xfs_create_tmpfile(
        *ipp = ip;
        return 0;
 
- out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
  out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
@@ -1427,7 +1381,6 @@ xfs_link(
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
-       int                     cancel_flags;
        int                     committed;
        int                     resblks;
 
@@ -1447,17 +1400,14 @@ xfs_link(
                goto std_return;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
        if (error == -ENOSPC) {
                resblks = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
        }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                goto error_return;
-       }
 
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
@@ -1486,19 +1436,19 @@ xfs_link(
        if (sip->i_d.di_nlink == 0) {
                error = xfs_iunlink_remove(tp, sip);
                if (error)
-                       goto abort_return;
+                       goto error_return;
        }
 
        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
                                        &first_block, &free_list, resblks);
        if (error)
-               goto abort_return;
+               goto error_return;
        xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
        error = xfs_bumplink(tp, sip);
        if (error)
-               goto abort_return;
+               goto error_return;
 
        /*
         * If this is a synchronous mount, make sure that the
@@ -1512,15 +1462,13 @@ xfs_link(
        error = xfs_bmap_finish (&tp, &free_list, &committed);
        if (error) {
                xfs_bmap_cancel(&free_list);
-               goto abort_return;
+               goto error_return;
        }
 
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       return xfs_trans_commit(tp);
 
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
  error_return:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
  std_return:
        return error;
 }
@@ -1555,7 +1503,6 @@ xfs_itruncate_extents(
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp = *tpp;
-       struct xfs_trans        *ntp;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        xfs_fileoff_t           first_unmap_block;
@@ -1613,29 +1560,7 @@ xfs_itruncate_extents(
                if (error)
                        goto out_bmap_cancel;
 
-               if (committed) {
-                       /*
-                        * Mark the inode dirty so it will be logged and
-                        * moved forward in the log as part of every commit.
-                        */
-                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               }
-
-               ntp = xfs_trans_dup(tp);
-               error = xfs_trans_commit(tp, 0);
-               tp = ntp;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               if (error)
-                       goto out;
-
-               /*
-                * Transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
-                */
-               xfs_log_ticket_put(tp->t_ticket);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               error = xfs_trans_roll(&tp, ip);
                if (error)
                        goto out;
        }
@@ -1756,7 +1681,7 @@ xfs_inactive_truncate(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -1777,7 +1702,7 @@ xfs_inactive_truncate(
 
        ASSERT(ip->i_d.di_nextents == 0);
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                goto error_unlock;
 
@@ -1785,7 +1710,7 @@ xfs_inactive_truncate(
        return 0;
 
 error_trans_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
 error_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
@@ -1835,7 +1760,7 @@ xfs_inactive_ifree(
                } else {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                }
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -1855,7 +1780,7 @@ xfs_inactive_ifree(
                                __func__, error);
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+               xfs_trans_cancel(tp);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                return error;
        }
@@ -1874,7 +1799,7 @@ xfs_inactive_ifree(
        if (error)
                xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
                        __func__, error);
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
                        __func__, error);
@@ -2235,28 +2160,42 @@ xfs_iunlink_remove(
  */
 STATIC int
 xfs_ifree_cluster(
-       xfs_inode_t     *free_ip,
-       xfs_trans_t     *tp,
-       xfs_ino_t       inum)
+       xfs_inode_t             *free_ip,
+       xfs_trans_t             *tp,
+       struct xfs_icluster     *xic)
 {
        xfs_mount_t             *mp = free_ip->i_mount;
        int                     blks_per_cluster;
        int                     inodes_per_cluster;
        int                     nbufs;
        int                     i, j;
+       int                     ioffset;
        xfs_daddr_t             blkno;
        xfs_buf_t               *bp;
        xfs_inode_t             *ip;
        xfs_inode_log_item_t    *iip;
        xfs_log_item_t          *lip;
        struct xfs_perag        *pag;
+       xfs_ino_t               inum;
 
+       inum = xic->first_ino;
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
        blks_per_cluster = xfs_icluster_size_fsb(mp);
        inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
        nbufs = mp->m_ialloc_blks / blks_per_cluster;
 
        for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+               /*
+                * The allocation bitmap tells us which inodes of the chunk were
+                * physically allocated. Skip the cluster if an inode falls into
+                * a sparse region.
+                */
+               ioffset = inum - xic->first_ino;
+               if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+                       ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
+                       continue;
+               }
+
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));
 
@@ -2414,8 +2353,7 @@ xfs_ifree(
        xfs_bmap_free_t *flist)
 {
        int                     error;
-       int                     delete;
-       xfs_ino_t               first_ino;
+       struct xfs_icluster     xic = { 0 };
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        ASSERT(ip->i_d.di_nlink == 0);
@@ -2431,7 +2369,7 @@ xfs_ifree(
        if (error)
                return error;
 
-       error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+       error = xfs_difree(tp, ip->i_ino, flist, &xic);
        if (error)
                return error;
 
@@ -2448,8 +2386,8 @@ xfs_ifree(
        ip->i_d.di_gen++;
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-       if (delete)
-               error = xfs_ifree_cluster(ip, tp, first_ino);
+       if (xic.deleted)
+               error = xfs_ifree_cluster(ip, tp, &xic);
 
        return error;
 }
@@ -2536,7 +2474,6 @@ xfs_remove(
        int                     error = 0;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
-       int                     cancel_flags;
        int                     committed;
        uint                    resblks;
 
@@ -2557,7 +2494,6 @@ xfs_remove(
                tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
        else
                tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 
        /*
         * We try to get the real space reservation first,
@@ -2576,7 +2512,6 @@ xfs_remove(
        }
        if (error) {
                ASSERT(error != -ENOSPC);
-               cancel_flags = 0;
                goto out_trans_cancel;
        }
 
@@ -2588,7 +2523,6 @@ xfs_remove(
        /*
         * If we're removing a directory perform some additional validation.
         */
-       cancel_flags |= XFS_TRANS_ABORT;
        if (is_dir) {
                ASSERT(ip->i_d.di_nlink >= 2);
                if (ip->i_d.di_nlink != 2) {
@@ -2644,7 +2578,7 @@ xfs_remove(
        if (error)
                goto out_bmap_cancel;
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                goto std_return;
 
@@ -2656,7 +2590,7 @@ xfs_remove(
  out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
  out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
  std_return:
        return error;
 }
@@ -2730,11 +2664,11 @@ xfs_finish_rename(
        error = xfs_bmap_finish(&tp, free_list, &committed);
        if (error) {
                xfs_bmap_cancel(free_list);
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+               xfs_trans_cancel(tp);
                return error;
        }
 
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       return xfs_trans_commit(tp);
 }
 
 /*
@@ -2855,7 +2789,7 @@ xfs_cross_rename(
 
 out_trans_abort:
        xfs_bmap_cancel(free_list);
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        return error;
 }
 
@@ -2915,7 +2849,6 @@ xfs_rename(
        int                     num_inodes = __XFS_SORT_INODES;
        bool                    new_parent = (src_dp != target_dp);
        bool                    src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
-       int                     cancel_flags = 0;
        int                     spaceres;
        int                     error;
 
@@ -2951,7 +2884,6 @@ xfs_rename(
        }
        if (error)
                goto out_trans_cancel;
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 
        /*
         * Attach the dquots to the inodes
@@ -3022,10 +2954,8 @@ xfs_rename(
                error = xfs_dir_createname(tp, target_dp, target_name,
                                                src_ip->i_ino, &first_block,
                                                &free_list, spaceres);
-               if (error == -ENOSPC)
-                       goto out_bmap_cancel;
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
 
                xfs_trans_ichgtime(tp, target_dp,
                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3033,7 +2963,7 @@ xfs_rename(
                if (new_parent && src_is_directory) {
                        error = xfs_bumplink(tp, target_dp);
                        if (error)
-                               goto out_trans_abort;
+                               goto out_bmap_cancel;
                }
        } else { /* target_ip != NULL */
                /*
@@ -3065,7 +2995,7 @@ xfs_rename(
                                        src_ip->i_ino,
                                        &first_block, &free_list, spaceres);
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
 
                xfs_trans_ichgtime(tp, target_dp,
                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3076,7 +3006,7 @@ xfs_rename(
                 */
                error = xfs_droplink(tp, target_ip);
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
 
                if (src_is_directory) {
                        /*
@@ -3084,7 +3014,7 @@ xfs_rename(
                         */
                        error = xfs_droplink(tp, target_ip);
                        if (error)
-                               goto out_trans_abort;
+                               goto out_bmap_cancel;
                }
        } /* target_ip != NULL */
 
@@ -3101,7 +3031,7 @@ xfs_rename(
                                        &first_block, &free_list, spaceres);
                ASSERT(error != -EEXIST);
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
        }
 
        /*
@@ -3127,7 +3057,7 @@ xfs_rename(
                 */
                error = xfs_droplink(tp, src_dp);
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
        }
 
        /*
@@ -3142,7 +3072,7 @@ xfs_rename(
                error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
                                           &first_block, &free_list, spaceres);
        if (error)
-               goto out_trans_abort;
+               goto out_bmap_cancel;
 
        /*
         * For whiteouts, we need to bump the link count on the whiteout inode.
@@ -3156,10 +3086,10 @@ xfs_rename(
                ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
                error = xfs_bumplink(tp, wip);
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
                error = xfs_iunlink_remove(tp, wip);
                if (error)
-                       goto out_trans_abort;
+                       goto out_bmap_cancel;
                xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
 
                /*
@@ -3180,12 +3110,10 @@ xfs_rename(
                IRELE(wip);
        return error;
 
-out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
 out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
 out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
        if (wip)
                IRELE(wip);
        return error;
@@ -3464,7 +3392,7 @@ xfs_iflush_int(
        ASSERT(ip->i_d.di_version > 1);
 
        /* set *dip = inode's place in the buffer */
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+       dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
index 87f67c6..ea7d85a 100644 (file)
@@ -336,7 +336,7 @@ xfs_set_dmattrs(
        tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +346,7 @@ xfs_set_dmattrs(
        ip->i_d.di_dmstate  = state;
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
 
        return error;
 }
@@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans(
        return tp;
 
 out_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
        return ERR_PTR(error);
 }
 
@@ -1253,7 +1253,7 @@ xfs_ioctl_setattr(
        else
                ip->i_d.di_extsize = 0;
 
-       code = xfs_trans_commit(tp, 0);
+       code = xfs_trans_commit(tp);
 
        /*
         * Release any dquot(s) the inode had kept before chown.
@@ -1265,7 +1265,7 @@ xfs_ioctl_setattr(
        return code;
 
 error_trans_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
 error_free_dquots:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(pdqp);
@@ -1338,11 +1338,11 @@ xfs_ioc_setxflags(
 
        error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                goto out_drop_write;
        }
 
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
 out_drop_write:
        mnt_drop_write_file(filp);
        return error;
index 38e633b..1f86033 100644 (file)
@@ -183,7 +183,7 @@ xfs_iomap_write_direct(
         * Check for running out of space, note: need lock to return
         */
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -213,7 +213,7 @@ xfs_iomap_write_direct(
        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error)
                goto out_bmap_cancel;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                goto out_unlock;
 
@@ -236,7 +236,7 @@ out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
 out_trans_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        goto out_unlock;
 }
 
@@ -690,7 +690,7 @@ xfs_iomap_write_allocate(
                        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                                  nres, 0);
                        if (error) {
-                               xfs_trans_cancel(tp, 0);
+                               xfs_trans_cancel(tp);
                                return error;
                        }
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -760,7 +760,7 @@ xfs_iomap_write_allocate(
                        if (error)
                                goto trans_cancel;
 
-                       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+                       error = xfs_trans_commit(tp);
                        if (error)
                                goto error0;
 
@@ -791,7 +791,7 @@ xfs_iomap_write_allocate(
 
 trans_cancel:
        xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
 error0:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
@@ -853,7 +853,7 @@ xfs_iomap_write_unwritten(
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                          resblks, 0);
                if (error) {
-                       xfs_trans_cancel(tp, 0);
+                       xfs_trans_cancel(tp);
                        return error;
                }
 
@@ -890,7 +890,7 @@ xfs_iomap_write_unwritten(
                if (error)
                        goto error_on_bmapi_transaction;
 
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                if (error)
                        return error;
@@ -914,7 +914,7 @@ xfs_iomap_write_unwritten(
 
 error_on_bmapi_transaction:
        xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+       xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
index 7f51f39..766b23f 100644 (file)
@@ -699,7 +699,7 @@ xfs_setattr_nonsize(
 
        if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
 
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -730,7 +730,7 @@ xfs_setattr_nonsize(
        return 0;
 
 out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
+       xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_dqrele:
        xfs_qm_dqrele(udqp);
@@ -752,7 +752,6 @@ xfs_setattr_size(
        struct xfs_trans        *tp;
        int                     error;
        uint                    lock_flags = 0;
-       uint                    commit_flags = 0;
        bool                    did_zeroing = false;
 
        trace_xfs_setattr(ip);
@@ -848,7 +847,11 @@ xfs_setattr_size(
         * to hope that the caller sees ENOMEM and retries the truncate
         * operation.
         */
-       error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+       if (IS_DAX(inode))
+               error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+       else
+               error = block_truncate_page(inode->i_mapping, newsize,
+                                           xfs_get_blocks);
        if (error)
                return error;
        truncate_setsize(inode, newsize);
@@ -858,7 +861,6 @@ xfs_setattr_size(
        if (error)
                goto out_trans_cancel;
 
-       commit_flags = XFS_TRANS_RELEASE_LOG_RES;
        lock_flags |= XFS_ILOCK_EXCL;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
@@ -898,7 +900,7 @@ xfs_setattr_size(
        if (newsize <= oldsize) {
                error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
                if (error)
-                       goto out_trans_abort;
+                       goto out_trans_cancel;
 
                /*
                 * Truncated "down", so we're removing references to old data
@@ -925,16 +927,14 @@ xfs_setattr_size(
        if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
 out_unlock:
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
        return error;
 
-out_trans_abort:
-       commit_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
-       xfs_trans_cancel(tp, commit_flags);
+       xfs_trans_cancel(tp);
        goto out_unlock;
 }
 
@@ -981,7 +981,7 @@ xfs_vn_update_time(
        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -1003,7 +1003,7 @@ xfs_vn_update_time(
        }
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
 }
 
 #define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1188,22 +1188,22 @@ xfs_diflags_to_iflags(
        struct inode            *inode,
        struct xfs_inode        *ip)
 {
-       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+       uint16_t                flags = ip->i_d.di_flags;
+
+       inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+                           S_NOATIME | S_DAX);
+
+       if (flags & XFS_DIFLAG_IMMUTABLE)
                inode->i_flags |= S_IMMUTABLE;
-       else
-               inode->i_flags &= ~S_IMMUTABLE;
-       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+       if (flags & XFS_DIFLAG_APPEND)
                inode->i_flags |= S_APPEND;
-       else
-               inode->i_flags &= ~S_APPEND;
-       if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+       if (flags & XFS_DIFLAG_SYNC)
                inode->i_flags |= S_SYNC;
-       else
-               inode->i_flags &= ~S_SYNC;
-       if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+       if (flags & XFS_DIFLAG_NOATIME)
                inode->i_flags |= S_NOATIME;
-       else
-               inode->i_flags &= ~S_NOATIME;
+       /* XXX: Also needs an on-disk per inode flag! */
+       if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+               inode->i_flags |= S_DAX;
 }
 
 /*
index 8042989..f41b0c3 100644 (file)
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
                }
 
                irec->ir_free |= xfs_inobt_maskn(0, idx);
-               *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+               *icount = irec->ir_count - irec->ir_freecount;
        }
 
        return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
                                goto del_cursor;
                        if (icount) {
                                irbp->ir_startino = r.ir_startino;
+                               irbp->ir_holemask = r.ir_holemask;
+                               irbp->ir_count = r.ir_count;
                                irbp->ir_freecount = r.ir_freecount;
                                irbp->ir_free = r.ir_free;
                                irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
                         * If this chunk has any allocated inodes, save it.
                         * Also start read-ahead now for this chunk.
                         */
-                       if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+                       if (r.ir_freecount < r.ir_count) {
                                xfs_bulkstat_ichunk_ra(mp, agno, &r);
                                irbp->ir_startino = r.ir_startino;
+                               irbp->ir_holemask = r.ir_holemask;
+                               irbp->ir_count = r.ir_count;
                                irbp->ir_freecount = r.ir_freecount;
                                irbp->ir_free = r.ir_free;
                                irbp++;
-                               icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+                               icount += r.ir_count - r.ir_freecount;
                        }
                        error = xfs_btree_increment(cur, 0, &stat);
                        if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
                agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
                buffer[bufidx].xi_startino =
                        XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
-               buffer[bufidx].xi_alloccount =
-                       XFS_INODES_PER_CHUNK - r.ir_freecount;
+               buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
                buffer[bufidx].xi_allocmask = ~r.ir_free;
                if (++bufidx == bcount) {
                        long    written;
index 7c7842c..85f883d 100644 (file)
@@ -32,26 +32,12 @@ typedef unsigned int                __uint32_t;
 typedef signed long long int   __int64_t;
 typedef unsigned long long int __uint64_t;
 
-typedef __uint32_t             inst_t;         /* an instruction */
-
 typedef __s64                  xfs_off_t;      /* <file offset> type */
 typedef unsigned long long     xfs_ino_t;      /* <inode> type */
 typedef __s64                  xfs_daddr_t;    /* <disk address> type */
-typedef char *                 xfs_caddr_t;    /* <core address> type */
 typedef __u32                  xfs_dev_t;
 typedef __u32                  xfs_nlink_t;
 
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
 #include "xfs_types.h"
 
 #include "kmem.h"
index bcc7cfa..08d4fe4 100644 (file)
@@ -109,7 +109,7 @@ xlog_ungrant_log_space(
 STATIC void
 xlog_verify_dest_ptr(
        struct xlog             *log,
-       char                    *ptr);
+       void                    *ptr);
 STATIC void
 xlog_verify_grant_tail(
        struct xlog *log);
@@ -513,7 +513,7 @@ xfs_log_done(
        struct xfs_mount        *mp,
        struct xlog_ticket      *ticket,
        struct xlog_in_core     **iclog,
-       uint                    flags)
+       bool                    regrant)
 {
        struct xlog             *log = mp->m_log;
        xfs_lsn_t               lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
            (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
             (xlog_commit_record(log, ticket, iclog, &lsn)))) {
                lsn = (xfs_lsn_t) -1;
-               if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
-                       flags |= XFS_LOG_REL_PERM_RESERV;
-               }
+               regrant = false;
        }
 
 
-       if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
-           (flags & XFS_LOG_REL_PERM_RESERV)) {
+       if (!regrant) {
                trace_xfs_log_done_nonperm(log, ticket);
 
                /*
@@ -541,7 +538,6 @@ xfs_log_done(
                 * request has been made to release a permanent reservation.
                 */
                xlog_ungrant_log_space(log, ticket);
-               xfs_log_ticket_put(ticket);
        } else {
                trace_xfs_log_done_perm(log, ticket);
 
@@ -553,6 +549,7 @@ xfs_log_done(
                ticket->t_flags |= XLOG_TIC_INITED;
        }
 
+       xfs_log_ticket_put(ticket);
        return lsn;
 }
 
@@ -1447,7 +1444,7 @@ xlog_alloc_log(
                iclog->ic_bp = bp;
                iclog->ic_data = bp->b_addr;
 #ifdef DEBUG
-               log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+               log->l_iclog_bak[i] = &iclog->ic_header;
 #endif
                head = &iclog->ic_header;
                memset(head, 0, sizeof(xlog_rec_header_t));
@@ -1602,7 +1599,7 @@ xlog_pack_data(
        int                     i, j, k;
        int                     size = iclog->ic_offset + roundoff;
        __be32                  cycle_lsn;
-       xfs_caddr_t             dp;
+       char                    *dp;
 
        cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
 
@@ -3664,7 +3661,7 @@ xlog_ticket_alloc(
 void
 xlog_verify_dest_ptr(
        struct xlog     *log,
-       char            *ptr)
+       void            *ptr)
 {
        int i;
        int good_ptr = 0;
@@ -3767,9 +3764,8 @@ xlog_verify_iclog(
        xlog_op_header_t        *ophead;
        xlog_in_core_t          *icptr;
        xlog_in_core_2_t        *xhdr;
-       xfs_caddr_t             ptr;
-       xfs_caddr_t             base_ptr;
-       __psint_t               field_offset;
+       void                    *base_ptr, *ptr, *p;
+       ptrdiff_t               field_offset;
        __uint8_t               clientid;
        int                     len, i, j, k, op_len;
        int                     idx;
@@ -3788,9 +3784,9 @@ xlog_verify_iclog(
        if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
                xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
 
-       ptr = (xfs_caddr_t) &iclog->ic_header;
-       for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
-            ptr += BBSIZE) {
+       base_ptr = ptr = &iclog->ic_header;
+       p = &iclog->ic_header;
+       for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
                if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
                        xfs_emerg(log->l_mp, "%s: unexpected magic num",
                                __func__);
@@ -3798,20 +3794,19 @@ xlog_verify_iclog(
 
        /* check fields */
        len = be32_to_cpu(iclog->ic_header.h_num_logops);
-       ptr = iclog->ic_datap;
-       base_ptr = ptr;
-       ophead = (xlog_op_header_t *)ptr;
+       base_ptr = ptr = iclog->ic_datap;
+       ophead = ptr;
        xhdr = iclog->ic_data;
        for (i = 0; i < len; i++) {
-               ophead = (xlog_op_header_t *)ptr;
+               ophead = ptr;
 
                /* clientid is only 1 byte */
-               field_offset = (__psint_t)
-                              ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+               p = &ophead->oh_clientid;
+               field_offset = p - base_ptr;
                if (!syncing || (field_offset & 0x1ff)) {
                        clientid = ophead->oh_clientid;
                } else {
-                       idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+                       idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
                        if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
                                j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                                k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,13 +3824,13 @@ xlog_verify_iclog(
                                (unsigned long)field_offset);
 
                /* check length */
-               field_offset = (__psint_t)
-                              ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+               p = &ophead->oh_len;
+               field_offset = p - base_ptr;
                if (!syncing || (field_offset & 0x1ff)) {
                        op_len = be32_to_cpu(ophead->oh_len);
                } else {
-                       idx = BTOBBT((__psint_t)&ophead->oh_len -
-                                   (__psint_t)iclog->ic_datap);
+                       idx = BTOBBT((uintptr_t)&ophead->oh_len -
+                                   (uintptr_t)iclog->ic_datap);
                        if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
                                j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                                k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
index 84e0deb..fa27aae 100644 (file)
@@ -110,15 +110,6 @@ static inline xfs_lsn_t    _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 
 #define        XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
 
-/*
- * Macros, structures, prototypes for interface to the log manager.
- */
-
-/*
- * Flags to xfs_log_done()
- */
-#define XFS_LOG_REL_PERM_RESERV        0x1
-
 /*
  * Flags to xfs_log_force()
  *
@@ -138,7 +129,7 @@ struct xfs_log_callback;
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
                       struct xlog_ticket *ticket,
                       struct xlog_in_core **iclog,
-                      uint             flags);
+                      bool regrant);
 int      _xfs_log_force(struct xfs_mount *mp,
                         uint           flags,
                         int            *log_forced);
@@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void     xfs_log_ticket_put(struct xlog_ticket *ticket);
 
 void   xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
-                               xfs_lsn_t *commit_lsn, int flags);
+                               xfs_lsn_t *commit_lsn, bool regrant);
 bool   xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void   xfs_log_work_queue(struct xfs_mount *mp);
index 45cc0ce..abc2ccb 100644 (file)
@@ -624,7 +624,7 @@ restart:
        spin_unlock(&cil->xc_push_lock);
 
        /* xfs_log_done always frees the ticket on error. */
-       commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+       commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
        if (commit_lsn == -1)
                goto out_abort;
 
@@ -773,14 +773,10 @@ xfs_log_commit_cil(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        xfs_lsn_t               *commit_lsn,
-       int                     flags)
+       bool                    regrant)
 {
        struct xlog             *log = mp->m_log;
        struct xfs_cil          *cil = log->l_cilp;
-       int                     log_flags = 0;
-
-       if (flags & XFS_TRANS_RELEASE_LOG_RES)
-               log_flags = XFS_LOG_REL_PERM_RESERV;
 
        /* lock out background commit */
        down_read(&cil->xc_ctx_lock);
@@ -795,7 +791,7 @@ xfs_log_commit_cil(
        if (commit_lsn)
                *commit_lsn = tp->t_commit_lsn;
 
-       xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+       xfs_log_done(mp, tp->t_ticket, NULL, regrant);
        xfs_trans_unreserve_and_mod_sb(tp);
 
        /*
@@ -809,7 +805,7 @@ xfs_log_commit_cil(
         * the log items. This affects (at least) processing of stale buffers,
         * inodes and EFIs.
         */
-       xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+       xfs_trans_free_items(tp, tp->t_commit_lsn, false);
 
        xlog_cil_push_background(log);
 
index db7cbde..1c87c8a 100644 (file)
@@ -409,7 +409,7 @@ struct xlog {
 
        /* The following fields are used for debugging; need to hold icloglock */
 #ifdef DEBUG
-       char                    *l_iclog_bak[XLOG_MAX_ICLOGS];
+       void                    *l_iclog_bak[XLOG_MAX_ICLOGS];
 #endif
 
 };
index 4f5784f..01dd228 100644 (file)
@@ -147,7 +147,7 @@ xlog_put_bp(
  * Return the address of the start of the given block number's data
  * in a log buffer.  The buffer covers a log sector-aligned region.
  */
-STATIC xfs_caddr_t
+STATIC char *
 xlog_align(
        struct xlog     *log,
        xfs_daddr_t     blk_no,
@@ -203,7 +203,7 @@ xlog_bread(
        xfs_daddr_t     blk_no,
        int             nbblks,
        struct xfs_buf  *bp,
-       xfs_caddr_t     *offset)
+       char            **offset)
 {
        int             error;
 
@@ -225,9 +225,9 @@ xlog_bread_offset(
        xfs_daddr_t     blk_no,         /* block to read from */
        int             nbblks,         /* blocks to read */
        struct xfs_buf  *bp,
-       xfs_caddr_t     offset)
+       char            *offset)
 {
-       xfs_caddr_t     orig_offset = bp->b_addr;
+       char            *orig_offset = bp->b_addr;
        int             orig_len = BBTOB(bp->b_length);
        int             error, error2;
 
@@ -396,7 +396,7 @@ xlog_find_cycle_start(
        xfs_daddr_t     *last_blk,
        uint            cycle)
 {
-       xfs_caddr_t     offset;
+       char            *offset;
        xfs_daddr_t     mid_blk;
        xfs_daddr_t     end_blk;
        uint            mid_cycle;
@@ -443,7 +443,7 @@ xlog_find_verify_cycle(
        uint            cycle;
        xfs_buf_t       *bp;
        xfs_daddr_t     bufblks;
-       xfs_caddr_t     buf = NULL;
+       char            *buf = NULL;
        int             error = 0;
 
        /*
@@ -509,7 +509,7 @@ xlog_find_verify_log_record(
 {
        xfs_daddr_t             i;
        xfs_buf_t               *bp;
-       xfs_caddr_t             offset = NULL;
+       char                    *offset = NULL;
        xlog_rec_header_t       *head = NULL;
        int                     error = 0;
        int                     smallmem = 0;
@@ -616,7 +616,7 @@ xlog_find_head(
        xfs_daddr_t     *return_head_blk)
 {
        xfs_buf_t       *bp;
-       xfs_caddr_t     offset;
+       char            *offset;
        xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
        int             num_scan_bblks;
        uint            first_half_cycle, last_half_cycle;
@@ -891,7 +891,7 @@ xlog_find_tail(
 {
        xlog_rec_header_t       *rhead;
        xlog_op_header_t        *op_head;
-       xfs_caddr_t             offset = NULL;
+       char                    *offset = NULL;
        xfs_buf_t               *bp;
        int                     error, i, found;
        xfs_daddr_t             umount_data_blk;
@@ -1099,7 +1099,7 @@ xlog_find_zeroed(
        xfs_daddr_t     *blk_no)
 {
        xfs_buf_t       *bp;
-       xfs_caddr_t     offset;
+       char            *offset;
        uint            first_cycle, last_cycle;
        xfs_daddr_t     new_blk, last_blk, start_blk;
        xfs_daddr_t     num_scan_bblks;
@@ -1199,7 +1199,7 @@ bp_err:
 STATIC void
 xlog_add_record(
        struct xlog             *log,
-       xfs_caddr_t             buf,
+       char                    *buf,
        int                     cycle,
        int                     block,
        int                     tail_cycle,
@@ -1227,7 +1227,7 @@ xlog_write_log_records(
        int             tail_cycle,
        int             tail_block)
 {
-       xfs_caddr_t     offset;
+       char            *offset;
        xfs_buf_t       *bp;
        int             balign, ealign;
        int             sectbb = log->l_sectBBsize;
@@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer(
                        return -EFSCORRUPTED;
                }
 
-               buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
-                                             next_unlinked_offset);
+               buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
                *buffer_nextp = *logged_nextp;
 
                /*
@@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer(
                 * have to leave the inode in a consistent state for whoever
                 * reads it next....
                 */
-               xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+               xfs_dinode_calc_crc(mp,
                                xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
 
        }
@@ -2503,8 +2502,8 @@ xlog_recover_inode_pass2(
        xfs_buf_t               *bp;
        xfs_dinode_t            *dip;
        int                     len;
-       xfs_caddr_t             src;
-       xfs_caddr_t             dest;
+       char                    *src;
+       char                    *dest;
        int                     error;
        int                     attr_index;
        uint                    fields;
@@ -2546,7 +2545,7 @@ xlog_recover_inode_pass2(
                goto out_release;
        }
        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+       dip = xfs_buf_offset(bp, in_f->ilf_boffset);
 
        /*
         * Make sure the place we're flushing out to really looks
@@ -2885,7 +2884,7 @@ xlog_recover_dquot_pass2(
                return error;
 
        ASSERT(bp);
-       ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+       ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
 
        /*
         * If the dquot has an LSN in it, recover the dquot only if it's less
@@ -3068,12 +3067,22 @@ xlog_recover_do_icreate_pass2(
                return -EINVAL;
        }
 
-       /* existing allocation is fixed value */
-       ASSERT(count == mp->m_ialloc_inos);
-       ASSERT(length == mp->m_ialloc_blks);
-       if (count != mp->m_ialloc_inos ||
-            length != mp->m_ialloc_blks) {
-               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+       /*
+        * The inode chunk is either full or sparse and we only support
+        * m_ialloc_min_blks sized sparse allocations at this time.
+        */
+       if (length != mp->m_ialloc_blks &&
+           length != mp->m_ialloc_min_blks) {
+               xfs_warn(log->l_mp,
+                        "%s: unsupported chunk length", __FUNCTION__);
+               return -EINVAL;
+       }
+
+       /* verify inode count is consistent with extent length */
+       if ((count >> mp->m_sb.sb_inopblog) != length) {
+               xfs_warn(log->l_mp,
+                        "%s: inconsistent inode count and chunk length",
+                        __FUNCTION__);
                return -EINVAL;
        }
 
@@ -3091,8 +3100,8 @@ xlog_recover_do_icreate_pass2(
                        XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
                return 0;
 
-       xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
-                                       be32_to_cpu(icl->icl_gen));
+       xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
+                             be32_to_cpu(icl->icl_gen));
        return 0;
 }
 
@@ -3364,17 +3373,17 @@ STATIC int
 xlog_recover_add_to_cont_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
-       xfs_caddr_t             dp,
+       char                    *dp,
        int                     len)
 {
        xlog_recover_item_t     *item;
-       xfs_caddr_t             ptr, old_ptr;
+       char                    *ptr, *old_ptr;
        int                     old_len;
 
        if (list_empty(&trans->r_itemq)) {
                /* finish copying rest of trans header */
                xlog_recover_add_item(&trans->r_itemq);
-               ptr = (xfs_caddr_t) &trans->r_theader +
+               ptr = (char *)&trans->r_theader +
                                sizeof(xfs_trans_header_t) - len;
                memcpy(ptr, dp, len);
                return 0;
@@ -3410,12 +3419,12 @@ STATIC int
 xlog_recover_add_to_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
-       xfs_caddr_t             dp,
+       char                    *dp,
        int                     len)
 {
        xfs_inode_log_format_t  *in_f;                  /* any will do */
        xlog_recover_item_t     *item;
-       xfs_caddr_t             ptr;
+       char                    *ptr;
 
        if (!len)
                return 0;
@@ -3504,7 +3513,7 @@ STATIC int
 xlog_recovery_process_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
-       xfs_caddr_t             dp,
+       char                    *dp,
        unsigned int            len,
        unsigned int            flags,
        int                     pass)
@@ -3611,8 +3620,8 @@ xlog_recover_process_ophdr(
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
        struct xlog_op_header   *ohead,
-       xfs_caddr_t             dp,
-       xfs_caddr_t             end,
+       char                    *dp,
+       char                    *end,
        int                     pass)
 {
        struct xlog_recover     *trans;
@@ -3661,11 +3670,11 @@ xlog_recover_process_data(
        struct xlog             *log,
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
-       xfs_caddr_t             dp,
+       char                    *dp,
        int                     pass)
 {
        struct xlog_op_header   *ohead;
-       xfs_caddr_t             end;
+       char                    *end;
        int                     num_logops;
        int                     error;
 
@@ -3751,11 +3760,11 @@ xlog_recover_process_efi(
        }
 
        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
        return error;
 
 abort_error:
-       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
        return error;
 }
 
@@ -3857,13 +3866,13 @@ xlog_recover_clear_agi_bucket(
        xfs_trans_log_buf(tp, agibp, offset,
                          (offset + sizeof(xfs_agino_t) - 1));
 
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
        if (error)
                goto out_error;
        return;
 
 out_abort:
-       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
 out_error:
        xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
        return;
@@ -4010,7 +4019,7 @@ xlog_recover_process_iunlinks(
 STATIC int
 xlog_unpack_data_crc(
        struct xlog_rec_header  *rhead,
-       xfs_caddr_t             dp,
+       char                    *dp,
        struct xlog             *log)
 {
        __le32                  crc;
@@ -4040,7 +4049,7 @@ xlog_unpack_data_crc(
 STATIC int
 xlog_unpack_data(
        struct xlog_rec_header  *rhead,
-       xfs_caddr_t             dp,
+       char                    *dp,
        struct xlog             *log)
 {
        int                     i, j, k;
@@ -4122,7 +4131,7 @@ xlog_do_recovery_pass(
 {
        xlog_rec_header_t       *rhead;
        xfs_daddr_t             blk_no;
-       xfs_caddr_t             offset;
+       char                    *offset;
        xfs_buf_t               *hbp, *dbp;
        int                     error = 0, h_size;
        int                     bblks, split_bblks;
index 6f23fbd..461e791 100644 (file)
@@ -724,6 +724,22 @@ xfs_mountfs(
                        mp->m_inode_cluster_size = new_size;
        }
 
+       /*
+        * If enabled, sparse inode chunk alignment is expected to match the
+        * cluster size. Full inode chunk alignment must match the chunk size,
+        * but that is checked on sb read verification...
+        */
+       if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+           mp->m_sb.sb_spino_align !=
+                       XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+               xfs_warn(mp,
+       "Sparse inode block alignment (%u) must match cluster size (%llu).",
+                        mp->m_sb.sb_spino_align,
+                        XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+               error = -EINVAL;
+               goto out_remove_uuid;
+       }
+
        /*
         * Set inode alignment fields
         */
index 8c995a2..7999e91 100644 (file)
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
        __uint64_t              m_flags;        /* global mount flags */
        int                     m_ialloc_inos;  /* inodes in inode allocation */
        int                     m_ialloc_blks;  /* blocks in inode allocation */
+       int                     m_ialloc_min_blks;/* min blocks in sparse inode
+                                                  * allocation */
        int                     m_inoalign_mask;/* mask sb_inoalignmt if used */
        uint                    m_qflags;       /* quota status flags */
        struct xfs_trans_resv   m_resv;         /* precomputed res values */
@@ -179,6 +181,8 @@ typedef struct xfs_mount {
                                                   allocator */
 #define XFS_MOUNT_NOATTR2      (1ULL << 25)    /* disable use of attr2 format */
 
+#define XFS_MOUNT_DAX          (1ULL << 62)    /* TEST ONLY! */
+
 
 /*
  * Default minimum read and write sizes.
index 981a657..ab4a606 100644 (file)
@@ -306,7 +306,7 @@ xfs_fs_commit_blocks(
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                goto out_drop_iolock;
        }
 
@@ -321,7 +321,7 @@ xfs_fs_commit_blocks(
        }
 
        xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
 
 out_drop_iolock:
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
index 5538468..eac9549 100644 (file)
@@ -756,7 +756,7 @@ xfs_qm_qino_alloc(
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
                                  XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -764,8 +764,7 @@ xfs_qm_qino_alloc(
                error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
                                                                &committed);
                if (error) {
-                       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                        XFS_TRANS_ABORT);
+                       xfs_trans_cancel(tp);
                        return error;
                }
        }
@@ -796,7 +795,7 @@ xfs_qm_qino_alloc(
        spin_unlock(&mp->m_sb_lock);
        xfs_log_sb(tp);
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                xfs_alert(mp, "%s failed (error %d)!", __func__, error);
index 9a25c92..3640c6e 100644 (file)
@@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
        tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                goto out_put;
        }
@@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
 
        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
        if (error) {
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                    XFS_TRANS_ABORT);
+               xfs_trans_cancel(tp);
                goto out_unlock;
        }
 
        ASSERT(ip->i_d.di_nextents == 0);
 
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
 
 out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -437,7 +436,7 @@ xfs_qm_scall_setqlim(
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                goto out_rele;
        }
 
@@ -548,7 +547,7 @@ xfs_qm_scall_setqlim(
        dqp->dq_flags |= XFS_DQ_DIRTY;
        xfs_trans_log_dquot(tp, dqp);
 
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
 
 out_rele:
        xfs_qm_dqrele(dqp);
@@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end(
 
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end(
         * We don't care about quotoff's performance.
         */
        xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
-       return error;
+       return xfs_trans_commit(tp);
 }
 
 
@@ -605,7 +603,7 @@ xfs_qm_log_quotaoff(
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                goto out;
        }
 
@@ -624,7 +622,7 @@ xfs_qm_log_quotaoff(
         * We don't care about quotoff's performance.
         */
        xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
+       error = xfs_trans_commit(tp);
        if (error)
                goto out;
 
index 5376dd4..ce6506a 100644 (file)
@@ -55,7 +55,6 @@ struct xfs_trans;
 typedef struct xfs_dqtrx {
        struct xfs_dquot *qt_dquot;       /* the dquot this refers to */
        ulong           qt_blk_res;       /* blks reserved on a dquot */
-       ulong           qt_blk_res_used;  /* blks used from the reservation */
        ulong           qt_ino_res;       /* inode reserved on a dquot */
        ulong           qt_ino_res_used;  /* inodes used from the reservation */
        long            qt_bcount_delta;  /* dquot blk count changes */
index f2079b6..f4e8c06 100644 (file)
@@ -780,7 +780,6 @@ xfs_growfs_rt_alloc(
         * Allocate space to the file, as necessary.
         */
        while (oblocks < nblocks) {
-               int             cancelflags = 0;
                xfs_trans_t     *tp;
 
                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
@@ -792,7 +791,6 @@ xfs_growfs_rt_alloc(
                                          resblks, 0);
                if (error)
                        goto error_cancel;
-               cancelflags = XFS_TRANS_RELEASE_LOG_RES;
                /*
                 * Lock the inode.
                 */
@@ -804,7 +802,6 @@ xfs_growfs_rt_alloc(
                 * Allocate blocks to the bitmap file.
                 */
                nmap = 1;
-               cancelflags |= XFS_TRANS_ABORT;
                error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
                                        XFS_BMAPI_METADATA, &firstblock,
                                        resblks, &map, &nmap, &flist);
@@ -818,14 +815,13 @@ xfs_growfs_rt_alloc(
                error = xfs_bmap_finish(&tp, &flist, &committed);
                if (error)
                        goto error_cancel;
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               error = xfs_trans_commit(tp);
                if (error)
                        goto error;
                /*
                 * Now we need to clear the allocated blocks.
                 * Do this one block per transaction, to keep it simple.
                 */
-               cancelflags = 0;
                for (bno = map.br_startoff, fsbno = map.br_startblock;
                     bno < map.br_startoff + map.br_blockcount;
                     bno++, fsbno++) {
@@ -851,7 +847,7 @@ xfs_growfs_rt_alloc(
                        if (bp == NULL) {
                                error = -EIO;
 error_cancel:
-                               xfs_trans_cancel(tp, cancelflags);
+                               xfs_trans_cancel(tp);
                                goto error;
                        }
                        memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
@@ -859,7 +855,7 @@ error_cancel:
                        /*
                         * Commit the transaction.
                         */
-                       error = xfs_trans_commit(tp, 0);
+                       error = xfs_trans_commit(tp);
                        if (error)
                                goto error;
                }
@@ -973,7 +969,6 @@ xfs_growfs_rt(
             bmbno < nrbmblocks;
             bmbno++) {
                xfs_trans_t     *tp;
-               int             cancelflags = 0;
 
                *nmp = *mp;
                nsbp = &nmp->m_sb;
@@ -1015,7 +1010,6 @@ xfs_growfs_rt(
                mp->m_rbmip->i_d.di_size =
                        nsbp->sb_rbmblocks * nsbp->sb_blocksize;
                xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
-               cancelflags |= XFS_TRANS_ABORT;
                /*
                 * Get the summary inode into the transaction.
                 */
@@ -1062,7 +1056,7 @@ xfs_growfs_rt(
                        nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
                if (error) {
 error_cancel:
-                       xfs_trans_cancel(tp, cancelflags);
+                       xfs_trans_cancel(tp);
                        break;
                }
                /*
@@ -1076,7 +1070,7 @@ error_cancel:
                mp->m_rsumlevels = nrsumlevels;
                mp->m_rsumsize = nrsumsize;
 
-               error = xfs_trans_commit(tp, 0);
+               error = xfs_trans_commit(tp);
                if (error)
                        break;
        }
index 858e1e6..1fb1656 100644 (file)
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj;        /* global debug sysfs attrs */
 #define MNTOPT_DISCARD    "discard"    /* Discard unused blocks */
 #define MNTOPT_NODISCARD   "nodiscard" /* Do not discard unused blocks */
 
+#define MNTOPT_DAX     "dax"           /* Enable direct access to bdev pages */
+
 /*
  * Table driven mount option parser.
  *
@@ -363,6 +365,10 @@ xfs_parseargs(
                        mp->m_flags |= XFS_MOUNT_DISCARD;
                } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
                        mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+               } else if (!strcmp(this_char, MNTOPT_DAX)) {
+                       mp->m_flags |= XFS_MOUNT_DAX;
+#endif
                } else {
                        xfs_warn(mp, "unknown mount option [%s].", this_char);
                        return -EINVAL;
@@ -452,8 +458,8 @@ done:
 }
 
 struct proc_xfs_info {
-       int     flag;
-       char    *str;
+       uint64_t        flag;
+       char            *str;
 };
 
 STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
                { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
                { XFS_MOUNT_DISCARD,            "," MNTOPT_DISCARD },
                { XFS_MOUNT_SMALL_INUMS,        "," MNTOPT_32BITINODE },
+               { XFS_MOUNT_DAX,                "," MNTOPT_DAX },
                { 0, NULL }
        };
        static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
        if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
                sb->s_flags |= MS_I_VERSION;
 
+       if (mp->m_flags & XFS_MOUNT_DAX) {
+               xfs_warn(mp,
+       "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+               if (sb->s_blocksize != PAGE_SIZE) {
+                       xfs_alert(mp,
+               "Filesystem block size invalid for DAX Turning DAX off.");
+                       mp->m_flags &= ~XFS_MOUNT_DAX;
+               } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+                       xfs_alert(mp,
+               "Block device does not support DAX Turning DAX off.");
+                       mp->m_flags &= ~XFS_MOUNT_DAX;
+               }
+       }
+
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
index 3df411e..4be27b0 100644 (file)
@@ -104,7 +104,7 @@ xfs_readlink_bmap(
                        cur_chunk += sizeof(struct xfs_dsymlink_hdr);
                }
 
-               memcpy(link + offset, bp->b_addr, byte_cnt);
+               memcpy(link + offset, cur_chunk, byte_cnt);
 
                pathlen -= byte_cnt;
                offset += byte_cnt;
@@ -178,7 +178,6 @@ xfs_symlink(
        struct xfs_bmap_free    free_list;
        xfs_fsblock_t           first_block;
        bool                    unlock_dp_on_error = false;
-       uint                    cancel_flags;
        int                     committed;
        xfs_fileoff_t           first_fsb;
        xfs_filblks_t           fs_blocks;
@@ -224,7 +223,6 @@ xfs_symlink(
                return error;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        /*
         * The symlink will fit into the inode data fork?
         * There can't be any attributes so we get the whole variable part.
@@ -239,10 +237,8 @@ xfs_symlink(
                resblks = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
        }
-       if (error) {
-               cancel_flags = 0;
+       if (error)
                goto out_trans_cancel;
-       }
 
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
@@ -394,7 +390,7 @@ xfs_symlink(
        if (error)
                goto out_bmap_cancel;
 
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error)
                goto out_release_inode;
 
@@ -407,9 +403,8 @@ xfs_symlink(
 
 out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
-       cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
+       xfs_trans_cancel(tp);
 out_release_inode:
        /*
         * Wait until after the current transaction is aborted to finish the
@@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt(
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt(
        /*
         * Commit the transaction containing extent freeing and EFDs.
         */
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       error = xfs_trans_commit(tp);
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error_unlock;
@@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt(
 error_bmap_cancel:
        xfs_bmap_cancel(&free_list);
 error_trans_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_trans_cancel(tp);
 error_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
index 615781b..8d916d3 100644 (file)
@@ -738,6 +738,65 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
                  __entry->blocks, __entry->shift, __entry->writeio_blocks)
 )
 
+/*
+ * xfs_irec_merge_pre: captures two (agino, holemask) pairs for AG @agno.
+ * The first pair is printed as "inobt", the second as "new".
+ * NOTE(review): presumably the existing inobt record and the new record
+ * about to be merged into it -- confirm against the caller.
+ */
+TRACE_EVENT(xfs_irec_merge_pre,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+                uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+       TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+               __field(uint16_t, holemask)
+               __field(xfs_agino_t, nagino)
+               __field(uint16_t, nholemask)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agino = agino;
+               __entry->holemask = holemask;
+               __entry->nagino = nagino;
+               /* Store the second record's own holemask, not a copy of the first. */
+               __entry->nholemask = nholemask;
+       ),
+       TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+                 __entry->agino, __entry->holemask, __entry->nagino,
+                 __entry->nholemask)
+)
+
+/*
+ * xfs_irec_merge_post: captures a single (agino, holemask) pair for AG
+ * @agno, printed as "inobt".
+ * NOTE(review): presumably the merged record -- confirm against the caller.
+ */
+TRACE_EVENT(xfs_irec_merge_post,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+                uint16_t holemask),
+       TP_ARGS(mp, agno, agino, holemask),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agino_t, agino)
+               __field(uint16_t, holemask)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agino = agino;
+               __entry->holemask = holemask;
+       ),
+       TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+                 MINOR(__entry->dev), __entry->agno, __entry->agino,
+                 __entry->holemask)
+)
+
 #define DEFINE_IREF_EVENT(name) \
 DEFINE_EVENT(xfs_iref_class, name, \
        TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
index 220ef2c..0582a27 100644 (file)
@@ -113,7 +113,7 @@ xfs_trans_free(
  * blocks.  Locks and log items, however, are not inherited.  They must
  * be added to the new transaction explicitly.
  */
-xfs_trans_t *
+STATIC xfs_trans_t *
 xfs_trans_dup(
        xfs_trans_t     *tp)
 {
@@ -251,14 +251,7 @@ xfs_trans_reserve(
         */
 undo_log:
        if (resp->tr_logres > 0) {
-               int             log_flags;
-
-               if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
-                       log_flags = XFS_LOG_REL_PERM_RESERV;
-               } else {
-                       log_flags = 0;
-               }
-               xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+               xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
                tp->t_ticket = NULL;
                tp->t_log_res = 0;
                tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -744,7 +737,7 @@ void
 xfs_trans_free_items(
        struct xfs_trans        *tp,
        xfs_lsn_t               commit_lsn,
-       int                     flags)
+       bool                    abort)
 {
        struct xfs_log_item_desc *lidp, *next;
 
@@ -755,7 +748,7 @@ xfs_trans_free_items(
 
                if (commit_lsn != NULLCOMMITLSN)
                        lip->li_ops->iop_committing(lip, commit_lsn);
-               if (flags & XFS_TRANS_ABORT)
+               if (abort)
                        lip->li_flags |= XFS_LI_ABORTED;
                lip->li_ops->iop_unlock(lip);
 
@@ -892,26 +885,16 @@ xfs_trans_committed_bulk(
  * have already been unlocked as if the commit had succeeded.
  * Do not reference the transaction structure after this call.
  */
-int
-xfs_trans_commit(
+static int
+__xfs_trans_commit(
        struct xfs_trans        *tp,
-       uint                    flags)
+       bool                    regrant)
 {
        struct xfs_mount        *mp = tp->t_mountp;
        xfs_lsn_t               commit_lsn = -1;
        int                     error = 0;
-       int                     log_flags = 0;
        int                     sync = tp->t_flags & XFS_TRANS_SYNC;
 
-       /*
-        * Determine whether this commit is releasing a permanent
-        * log reservation or not.
-        */
-       if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-               ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-               log_flags = XFS_LOG_REL_PERM_RESERV;
-       }
-
        /*
         * If there is nothing to be logged by the transaction,
         * then unlock all of the items associated with the
@@ -936,7 +919,7 @@ xfs_trans_commit(
                xfs_trans_apply_sb_deltas(tp);
        xfs_trans_apply_dquot_deltas(tp);
 
-       xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+       xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
 
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
        xfs_trans_free(tp);
@@ -964,18 +947,25 @@ out_unreserve:
         */
        xfs_trans_unreserve_and_mod_dquots(tp);
        if (tp->t_ticket) {
-               commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+               commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
                if (commit_lsn == -1 && !error)
                        error = -EIO;
        }
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-       xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+       xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
        xfs_trans_free(tp);
 
        XFS_STATS_INC(xs_trans_empty);
        return error;
 }
 
+int
+xfs_trans_commit(
+       struct xfs_trans        *tp)
+{
+       return __xfs_trans_commit(tp, false);
+}
+
 /*
  * Unlock all of the transaction's items and free the transaction.
  * The transaction must not have modified any of its items, because
@@ -986,29 +976,22 @@ out_unreserve:
  */
 void
 xfs_trans_cancel(
-       xfs_trans_t             *tp,
-       int                     flags)
+       struct xfs_trans        *tp)
 {
-       int                     log_flags;
-       xfs_mount_t             *mp = tp->t_mountp;
+       struct xfs_mount        *mp = tp->t_mountp;
+       bool                    dirty = (tp->t_flags & XFS_TRANS_DIRTY);
 
-       /*
-        * See if the caller is being too lazy to figure out if
-        * the transaction really needs an abort.
-        */
-       if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
-               flags &= ~XFS_TRANS_ABORT;
        /*
         * See if the caller is relying on us to shut down the
         * filesystem.  This happens in paths where we detect
         * corruption and decide to give up.
         */
-       if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+       if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
                XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
        }
 #ifdef DEBUG
-       if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+       if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
                struct xfs_log_item_desc *lidp;
 
                list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -1018,27 +1001,20 @@ xfs_trans_cancel(
        xfs_trans_unreserve_and_mod_sb(tp);
        xfs_trans_unreserve_and_mod_dquots(tp);
 
-       if (tp->t_ticket) {
-               if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-                       ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-                       log_flags = XFS_LOG_REL_PERM_RESERV;
-               } else {
-                       log_flags = 0;
-               }
-               xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-       }
+       if (tp->t_ticket)
+               xfs_log_done(mp, tp->t_ticket, NULL, false);
 
        /* mark this thread as no longer being in a transaction */
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-       xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+       xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
        xfs_trans_free(tp);
 }
 
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
- * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * committed with xfs_trans_commit(), but we still want as soon
  * as possible to let chunks of it go to the log. So we commit the
  * chunk we've been working on and get a new transaction to continue.
  */
@@ -1055,7 +1031,8 @@ xfs_trans_roll(
         * Ensure that the inode is always logged.
         */
        trans = *tpp;
-       xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+       if (dp)
+               xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
 
        /*
         * Copy the critical parameters from one trans to the next.
@@ -1071,19 +1048,12 @@ xfs_trans_roll(
         * is in progress. The caller takes the responsibility to cancel
         * the duplicate transaction that gets returned.
         */
-       error = xfs_trans_commit(trans, 0);
+       error = __xfs_trans_commit(trans, true);
        if (error)
                return error;
 
        trans = *tpp;
 
-       /*
-        * transaction commit worked ok so we can drop the extra ticket
-        * reference that we gained in xfs_trans_dup()
-        */
-       xfs_log_ticket_put(trans->t_ticket);
-
-
        /*
         * Reserve space in the log for the next transaction.
         * This also pushes items in the "AIL", the list of logged items,
@@ -1100,6 +1070,7 @@ xfs_trans_roll(
        if (error)
                return error;
 
-       xfs_trans_ijoin(trans, dp, 0);
+       if (dp)
+               xfs_trans_ijoin(trans, dp, 0);
        return 0;
 }
index b5bc1ab..3b21b4e 100644 (file)
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
  */
-#define        xfs_trans_get_log_res(tp)       ((tp)->t_log_res)
-#define        xfs_trans_get_log_count(tp)     ((tp)->t_log_count)
 #define        xfs_trans_get_block_res(tp)     ((tp)->t_blk_res)
 #define        xfs_trans_set_sync(tp)          ((tp)->t_flags |= XFS_TRANS_SYNC)
 
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
  */
 xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
 int            xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
                                  uint, uint);
 void           xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -228,9 +225,9 @@ void                xfs_trans_log_efd_extent(xfs_trans_t *,
                                         struct xfs_efd_log_item *,
                                         xfs_fsblock_t,
                                         xfs_extlen_t);
-int            xfs_trans_commit(xfs_trans_t *, uint flags);
+int            xfs_trans_commit(struct xfs_trans *);
 int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-void           xfs_trans_cancel(xfs_trans_t *, int);
+void           xfs_trans_cancel(xfs_trans_t *);
 int            xfs_trans_ail_init(struct xfs_mount *);
 void           xfs_trans_ail_destroy(struct xfs_mount *);
 
index 573aefb..1098cf4 100644 (file)
@@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next(
 {
        struct xfs_log_item     *lip = cur->item;
 
-       if ((__psint_t)lip & 1)
+       if ((uintptr_t)lip & 1)
                lip = xfs_ail_min(ailp);
        if (lip)
                cur->item = xfs_ail_next(ailp, lip);
@@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear(
        list_for_each_entry(cur, &ailp->xa_cursors, list) {
                if (cur->item == lip)
                        cur->item = (struct xfs_log_item *)
-                                       ((__psint_t)cur->item | 1);
+                                       ((uintptr_t)cur->item | 1);
        }
 }
 
@@ -287,7 +287,7 @@ xfs_ail_splice(
         * find the place in the AIL where the items belong.
         */
        lip = cur ? cur->item : NULL;
-       if (!lip || (__psint_t) lip & 1)
+       if (!lip || (uintptr_t)lip & 1)
                lip = __xfs_trans_ail_cursor_last(ailp, lsn);
 
        /*
index 76a16df..ce78534 100644 (file)
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
        xfs_trans_t     *ntp)
 {
        xfs_dqtrx_t     *oq, *nq;
-       int             i,j;
+       int             i, j;
        xfs_dqtrx_t     *oqa, *nqa;
+       ulong           blk_res_used;
 
        if (!otp->t_dqinfo)
                return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
         * Because the quota blk reservation is carried forward,
         * it is also necessary to carry forward the DQ_DIRTY flag.
         */
-       if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+       if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
                ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
 
        for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
                oqa = otp->t_dqinfo->dqs[j];
                nqa = ntp->t_dqinfo->dqs[j];
                for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+                       blk_res_used = 0;
+
                        if (oqa[i].qt_dquot == NULL)
                                break;
                        oq = &oqa[i];
                        nq = &nqa[i];
 
+                       if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+                               blk_res_used = oq->qt_bcount_delta;
+
                        nq->qt_dquot = oq->qt_dquot;
                        nq->qt_bcount_delta = nq->qt_icount_delta = 0;
                        nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
                        /*
                         * Transfer whatever is left of the reservations.
                         */
-                       nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
-                       oq->qt_blk_res = oq->qt_blk_res_used;
+                       nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+                       oq->qt_blk_res = blk_res_used;
 
                        nq->qt_rtblk_res = oq->qt_rtblk_res -
                                oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
                 * disk blocks used.
                 */
              case XFS_TRANS_DQ_BCOUNT:
-               if (qtrx->qt_blk_res && delta > 0) {
-                       qtrx->qt_blk_res_used += (ulong)delta;
-                       ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
-               }
                qtrx->qt_bcount_delta += delta;
                break;
 
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
                         * reservation that a transaction structure knows of.
                         */
                        if (qtrx->qt_blk_res != 0) {
-                               if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
-                                       if (qtrx->qt_blk_res >
-                                           qtrx->qt_blk_res_used)
+                               ulong blk_res_used = 0;
+
+                               if (qtrx->qt_bcount_delta > 0)
+                                       blk_res_used = qtrx->qt_bcount_delta;
+
+                               if (qtrx->qt_blk_res != blk_res_used) {
+                                       if (qtrx->qt_blk_res > blk_res_used)
                                                dqp->q_res_bcount -= (xfs_qcnt_t)
                                                        (qtrx->qt_blk_res -
-                                                        qtrx->qt_blk_res_used);
+                                                        blk_res_used);
                                        else
                                                dqp->q_res_bcount -= (xfs_qcnt_t)
-                                                       (qtrx->qt_blk_res_used -
+                                                       (blk_res_used -
                                                         qtrx->qt_blk_res);
                                }
                        } else {
index bd12818..1b73629 100644 (file)
@@ -30,7 +30,7 @@ void  xfs_trans_init(struct xfs_mount *);
 void   xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void   xfs_trans_del_item(struct xfs_log_item *);
 void   xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
-                               int flags);
+                               bool abort);
 void   xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 void   xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
index e351da4..3f1a846 100644 (file)
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC               0x00000001
 #define MAY_WRITE              0x00000002
@@ -2655,9 +2656,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,