Merge branch 'for-linus-merged' of git://oss.sgi.com/xfs/xfs
author	Linus Torvalds <torvalds@linux-foundation.org>
Tue, 11 Jan 2011 19:42:06 +0000 (11:42 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Tue, 11 Jan 2011 19:42:06 +0000 (11:42 -0800)
* 'for-linus-merged' of git://oss.sgi.com/xfs/xfs: (47 commits)
  xfs: convert grant head manipulations to lockless algorithm
  xfs: introduce new locks for the log grant ticket wait queues
  xfs: convert log grant heads to atomic variables
  xfs: convert l_tail_lsn to an atomic variable.
  xfs: convert l_last_sync_lsn to an atomic variable
  xfs: make AIL tail pushing independent of the grant lock
  xfs: use wait queues directly for the log wait queues
  xfs: combine grant heads into a single 64 bit integer
  xfs: rework log grant space calculations
  xfs: factor out common grant head/log tail verification code
  xfs: convert log grant ticket queues to list heads
  xfs: use AIL bulk delete function to implement single delete
  xfs: use AIL bulk update function to implement single updates
  xfs: remove all the inodes on a buffer from the AIL in bulk
  xfs: consume iodone callback items on buffers as they are processed
  xfs: reduce the number of AIL push wakeups
  xfs: bulk AIL insertion during transaction commit
  xfs: clean up xfs_ail_delete()
  xfs: Pull EFI/EFD handling out from under the AIL lock
  xfs: fix EFI transaction cancellation.
  ...

38 files changed:
fs/xfs/linux-2.6/sv.h [deleted file]
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_aops.h
fs/xfs/linux-2.6/xfs_buf.c
fs/xfs/linux-2.6/xfs_buf.h
fs/xfs/linux-2.6/xfs_export.c
fs/xfs/linux-2.6/xfs_linux.h
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_sync.c
fs/xfs/linux-2.6/xfs_trace.h
fs/xfs/quota/xfs_dquot.c
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_attr_leaf.c
fs/xfs/xfs_btree.c
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_buf_item.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_extfree_item.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_iget.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_extfree.c
fs/xfs/xfs_trans_priv.h
fs/xfs/xfs_vnodeops.c

diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c3..0000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
-       wait_queue_head_t waiters;
-} sv_t;
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
-{
-       DECLARE_WAITQUEUE(wait, current);
-
-       add_wait_queue_exclusive(&sv->waiters, &wait);
-       __set_current_state(TASK_UNINTERRUPTIBLE);
-       spin_unlock(lock);
-
-       schedule();
-
-       remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define sv_init(sv,flag,name) \
-       init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-       /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-       _sv_wait(sv, lock)
-#define sv_signal(sv) \
-       wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-       wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f612..ec7bbb5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-       IO_READ,        /* mapping for a read */
-       IO_DELAY,       /* mapping covers delalloc region */
-       IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
-       IO_NEW          /* just allocated */
-};
 
 /*
  * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
        xfs_inode_t             *ip = XFS_I(ioend->io_inode);
        xfs_fsize_t             isize;
 
-       ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-       ASSERT(ioend->io_type != IO_READ);
-
        if (unlikely(ioend->io_error))
                return 0;
 
@@ -244,10 +232,8 @@ xfs_end_io(
         * We might have to update the on-disk file size after extending
         * writes.
         */
-       if (ioend->io_type != IO_READ) {
-               error = xfs_setfilesize(ioend);
-               ASSERT(!error || error == EAGAIN);
-       }
+       error = xfs_setfilesize(ioend);
+       ASSERT(!error || error == EAGAIN);
 
        /*
         * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
 xfs_map_blocks(
        struct inode            *inode,
        loff_t                  offset,
-       ssize_t                 count,
        struct xfs_bmbt_irec    *imap,
-       int                     flags)
+       int                     type,
+       int                     nonblocking)
 {
-       int                     nmaps = 1;
-       int                     new = 0;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 count = 1 << inode->i_blkbits;
+       xfs_fileoff_t           offset_fsb, end_fsb;
+       int                     error = 0;
+       int                     bmapi_flags = XFS_BMAPI_ENTIRE;
+       int                     nimaps = 1;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
+
+       if (type == IO_UNWRITTEN)
+               bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+               if (nonblocking)
+                       return -XFS_ERROR(EAGAIN);
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+       }
 
-       return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
+       ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+              (ip->i_df.if_flags & XFS_IFEXTENTS));
+       ASSERT(offset <= mp->m_maxioffset);
+
+       if (offset + count > mp->m_maxioffset)
+               count = mp->m_maxioffset - offset;
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+                         bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       if (error)
+               return -XFS_ERROR(error);
+
+       if (type == IO_DELALLOC &&
+           (!nimaps || isnullstartblock(imap->br_startblock))) {
+               error = xfs_iomap_write_allocate(ip, offset, count, imap);
+               if (!error)
+                       trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+               return -XFS_ERROR(error);
+       }
+
+#ifdef DEBUG
+       if (type == IO_UNWRITTEN) {
+               ASSERT(nimaps);
+               ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+               ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+       }
+#endif
+       if (nimaps)
+               trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+       return 0;
 }
 
 STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
 
        submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
                   WRITE_SYNC_PLUG : WRITE, bio);
-       ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
-       bio_put(bio);
 }
 
 STATIC struct bio *
 xfs_alloc_ioend_bio(
        struct buffer_head      *bh)
 {
-       struct bio              *bio;
        int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
-
-       do {
-               bio = bio_alloc(GFP_NOIO, nvecs);
-               nvecs >>= 1;
-       } while (!bio);
+       struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
 
        ASSERT(bio->bi_private == NULL);
        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
-       bio_get(bio);
        return bio;
 }
 
@@ -470,9 +497,8 @@ xfs_submit_ioend(
        /* Pass 1 - start writeback */
        do {
                next = ioend->io_list;
-               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
                        xfs_start_buffer_writeback(bh);
-               }
        } while ((ioend = next) != NULL);
 
        /* Pass 2 - submit I/O */
@@ -600,116 +626,12 @@ xfs_map_at_offset(
        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
-       lock_buffer(bh);
        xfs_map_buffer(inode, bh, imap, offset);
-       bh->b_bdev = xfs_find_bdev_for_inode(inode);
        set_buffer_mapped(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
 }
 
-/*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
-       struct page             *page,
-       unsigned int            pg_offset)
-{
-       struct buffer_head      *bh, *head;
-       int                     ret = 0;
-
-       if (PageWriteback(page))
-               return 0;
-       if (!PageDirty(page))
-               return 0;
-       if (!page->mapping)
-               return 0;
-       if (!page_has_buffers(page))
-               return 0;
-
-       bh = head = page_buffers(page);
-       do {
-               if (!buffer_uptodate(bh))
-                       break;
-               if (!buffer_mapped(bh))
-                       break;
-               ret += bh->b_size;
-               if (ret >= pg_offset)
-                       break;
-       } while ((bh = bh->b_this_page) != head);
-
-       return ret;
-}
-
-STATIC size_t
-xfs_probe_cluster(
-       struct inode            *inode,
-       struct page             *startpage,
-       struct buffer_head      *bh,
-       struct buffer_head      *head)
-{
-       struct pagevec          pvec;
-       pgoff_t                 tindex, tlast, tloff;
-       size_t                  total = 0;
-       int                     done = 0, i;
-
-       /* First sum forwards in this page */
-       do {
-               if (!buffer_uptodate(bh) || !buffer_mapped(bh))
-                       return total;
-               total += bh->b_size;
-       } while ((bh = bh->b_this_page) != head);
-
-       /* if we reached the end of the page, sum forwards in following pages */
-       tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-       tindex = startpage->index + 1;
-
-       /* Prune this back to avoid pathological behavior */
-       tloff = min(tlast, startpage->index + 64);
-
-       pagevec_init(&pvec, 0);
-       while (!done && tindex <= tloff) {
-               unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-               if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-                       break;
-
-               for (i = 0; i < pagevec_count(&pvec); i++) {
-                       struct page *page = pvec.pages[i];
-                       size_t pg_offset, pg_len = 0;
-
-                       if (tindex == tlast) {
-                               pg_offset =
-                                   i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
-                               if (!pg_offset) {
-                                       done = 1;
-                                       break;
-                               }
-                       } else
-                               pg_offset = PAGE_CACHE_SIZE;
-
-                       if (page->index == tindex && trylock_page(page)) {
-                               pg_len = xfs_probe_page(page, pg_offset);
-                               unlock_page(page);
-                       }
-
-                       if (!pg_len) {
-                               done = 1;
-                               break;
-                       }
-
-                       total += pg_len;
-                       tindex++;
-               }
-
-               pagevec_release(&pvec);
-               cond_resched();
-       }
-
-       return total;
-}
-
 /*
  * Test if a given page is suitable for writing as part of an unwritten
  * or delayed allocate extent.
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
                        if (buffer_unwritten(bh))
                                acceptable = (type == IO_UNWRITTEN);
                        else if (buffer_delay(bh))
-                               acceptable = (type == IO_DELAY);
+                               acceptable = (type == IO_DELALLOC);
                        else if (buffer_dirty(bh) && buffer_mapped(bh))
-                               acceptable = (type == IO_NEW);
+                               acceptable = (type == IO_OVERWRITE);
                        else
                                break;
                } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
        loff_t                  tindex,
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
-       struct writeback_control *wbc,
-       int                     all_bh)
+       struct writeback_control *wbc)
 {
        struct buffer_head      *bh, *head;
        xfs_off_t               end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
                        continue;
                }
 
-               if (buffer_unwritten(bh) || buffer_delay(bh)) {
+               if (buffer_unwritten(bh) || buffer_delay(bh) ||
+                   buffer_mapped(bh)) {
                        if (buffer_unwritten(bh))
                                type = IO_UNWRITTEN;
+                       else if (buffer_delay(bh))
+                               type = IO_DELALLOC;
                        else
-                               type = IO_DELAY;
+                               type = IO_OVERWRITE;
 
                        if (!xfs_imap_valid(inode, imap, offset)) {
                                done = 1;
                                continue;
                        }
 
-                       ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-                       ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-                       xfs_map_at_offset(inode, bh, imap, offset);
+                       lock_buffer(bh);
+                       if (type != IO_OVERWRITE)
+                               xfs_map_at_offset(inode, bh, imap, offset);
                        xfs_add_to_ioend(inode, bh, offset, type,
                                         ioendp, done);
 
                        page_dirty--;
                        count++;
                } else {
-                       type = IO_NEW;
-                       if (buffer_mapped(bh) && all_bh) {
-                               lock_buffer(bh);
-                               xfs_add_to_ioend(inode, bh, offset,
-                                               type, ioendp, done);
-                               count++;
-                               page_dirty--;
-                       } else {
-                               done = 1;
-                       }
+                       done = 1;
                }
        } while (offset += len, (bh = bh->b_this_page) != head);
 
@@ -876,7 +790,6 @@ xfs_cluster_write(
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
        struct writeback_control *wbc,
-       int                     all_bh,
        pgoff_t                 tlast)
 {
        struct pagevec          pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
 
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-                                       imap, ioendp, wbc, all_bh);
+                                       imap, ioendp, wbc);
                        if (done)
                                break;
                }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
        struct buffer_head      *bh, *head;
        loff_t                  offset = page_offset(page);
 
-       if (!xfs_is_delayed_page(page, IO_DELAY))
+       if (!xfs_is_delayed_page(page, IO_DELALLOC))
                goto out_invalidate;
 
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
        unsigned int            type;
        __uint64_t              end_offset;
        pgoff_t                 end_index, last_index;
-       ssize_t                 size, len;
-       int                     flags, err, imap_valid = 0, uptodate = 1;
+       ssize_t                 len;
+       int                     err, imap_valid = 0, uptodate = 1;
        int                     count = 0;
-       int                     all_bh = 0;
+       int                     nonblocking = 0;
 
        trace_xfs_writepage(inode, page, 0);
 
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
 
        bh = head = page_buffers(page);
        offset = page_offset(page);
-       flags = BMAPI_READ;
-       type = IO_NEW;
+       type = IO_OVERWRITE;
+
+       if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+               nonblocking = 1;
 
        do {
+               int new_ioend = 0;
+
                if (offset >= end_offset)
                        break;
                if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
                        continue;
                }
 
-               if (imap_valid)
-                       imap_valid = xfs_imap_valid(inode, &imap, offset);
-
-               if (buffer_unwritten(bh) || buffer_delay(bh)) {
-                       int new_ioend = 0;
-
-                       /*
-                        * Make sure we don't use a read-only iomap
-                        */
-                       if (flags == BMAPI_READ)
-                               imap_valid = 0;
-
-                       if (buffer_unwritten(bh)) {
+               if (buffer_unwritten(bh)) {
+                       if (type != IO_UNWRITTEN) {
                                type = IO_UNWRITTEN;
-                               flags = BMAPI_WRITE | BMAPI_IGNSTATE;
-                       } else if (buffer_delay(bh)) {
-                               type = IO_DELAY;
-                               flags = BMAPI_ALLOCATE;
-
-                               if (wbc->sync_mode == WB_SYNC_NONE)
-                                       flags |= BMAPI_TRYLOCK;
-                       }
-
-                       if (!imap_valid) {
-                               /*
-                                * If we didn't have a valid mapping then we
-                                * need to ensure that we put the new mapping
-                                * in a new ioend structure. This needs to be
-                                * done to ensure that the ioends correctly
-                                * reflect the block mappings at io completion
-                                * for unwritten extent conversion.
-                                */
-                               new_ioend = 1;
-                               err = xfs_map_blocks(inode, offset, len,
-                                               &imap, flags);
-                               if (err)
-                                       goto error;
-                               imap_valid = xfs_imap_valid(inode, &imap,
-                                                           offset);
+                               imap_valid = 0;
                        }
-                       if (imap_valid) {
-                               xfs_map_at_offset(inode, bh, &imap, offset);
-                               xfs_add_to_ioend(inode, bh, offset, type,
-                                                &ioend, new_ioend);
-                               count++;
+               } else if (buffer_delay(bh)) {
+                       if (type != IO_DELALLOC) {
+                               type = IO_DELALLOC;
+                               imap_valid = 0;
                        }
                } else if (buffer_uptodate(bh)) {
-                       /*
-                        * we got here because the buffer is already mapped.
-                        * That means it must already have extents allocated
-                        * underneath it. Map the extent by reading it.
-                        */
-                       if (!imap_valid || flags != BMAPI_READ) {
-                               flags = BMAPI_READ;
-                               size = xfs_probe_cluster(inode, page, bh, head);
-                               err = xfs_map_blocks(inode, offset, size,
-                                               &imap, flags);
-                               if (err)
-                                       goto error;
-                               imap_valid = xfs_imap_valid(inode, &imap,
-                                                           offset);
+                       if (type != IO_OVERWRITE) {
+                               type = IO_OVERWRITE;
+                               imap_valid = 0;
                        }
+               } else {
+                       if (PageUptodate(page)) {
+                               ASSERT(buffer_mapped(bh));
+                               imap_valid = 0;
+                       }
+                       continue;
+               }
 
+               if (imap_valid)
+                       imap_valid = xfs_imap_valid(inode, &imap, offset);
+               if (!imap_valid) {
                        /*
-                        * We set the type to IO_NEW in case we are doing a
-                        * small write at EOF that is extending the file but
-                        * without needing an allocation. We need to update the
-                        * file size on I/O completion in this case so it is
-                        * the same case as having just allocated a new extent
-                        * that we are writing into for the first time.
+                        * If we didn't have a valid mapping then we need to
+                        * put the new mapping into a separate ioend structure.
+                        * This ensures non-contiguous extents always have
+                        * separate ioends, which is particularly important
+                        * for unwritten extent conversion at I/O completion
+                        * time.
                         */
-                       type = IO_NEW;
-                       if (trylock_buffer(bh)) {
-                               if (imap_valid)
-                                       all_bh = 1;
-                               xfs_add_to_ioend(inode, bh, offset, type,
-                                               &ioend, !imap_valid);
-                               count++;
-                       } else {
-                               imap_valid = 0;
-                       }
-               } else if (PageUptodate(page)) {
-                       ASSERT(buffer_mapped(bh));
-                       imap_valid = 0;
+                       new_ioend = 1;
+                       err = xfs_map_blocks(inode, offset, &imap, type,
+                                            nonblocking);
+                       if (err)
+                               goto error;
+                       imap_valid = xfs_imap_valid(inode, &imap, offset);
+               }
+               if (imap_valid) {
+                       lock_buffer(bh);
+                       if (type != IO_OVERWRITE)
+                               xfs_map_at_offset(inode, bh, &imap, offset);
+                       xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+                                        new_ioend);
+                       count++;
                }
 
                if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
                        end_index = last_index;
 
                xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-                                       wbc, all_bh, end_index);
+                                 wbc, end_index);
        }
 
        if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
        int                     create,
        int                     direct)
 {
-       int                     flags = create ? BMAPI_WRITE : BMAPI_READ;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb, end_fsb;
+       int                     error = 0;
+       int                     lockmode = 0;
        struct xfs_bmbt_irec    imap;
+       int                     nimaps = 1;
        xfs_off_t               offset;
        ssize_t                 size;
-       int                     nimap = 1;
        int                     new = 0;
-       int                     error;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
 
        offset = (xfs_off_t)iblock << inode->i_blkbits;
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
        if (!create && direct && offset >= i_size_read(inode))
                return 0;
 
-       if (direct && create)
-               flags |= BMAPI_DIRECT;
+       if (create) {
+               lockmode = XFS_ILOCK_EXCL;
+               xfs_ilock(ip, lockmode);
+       } else {
+               lockmode = xfs_ilock_map_shared(ip);
+       }
+
+       ASSERT(offset <= mp->m_maxioffset);
+       if (offset + size > mp->m_maxioffset)
+               size = mp->m_maxioffset - offset;
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-       error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
-                         &new);
+       error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+                         XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
        if (error)
-               return -error;
-       if (nimap == 0)
-               return 0;
+               goto out_unlock;
+
+       if (create &&
+           (!nimaps ||
+            (imap.br_startblock == HOLESTARTBLOCK ||
+             imap.br_startblock == DELAYSTARTBLOCK))) {
+               if (direct) {
+                       error = xfs_iomap_write_direct(ip, offset, size,
+                                                      &imap, nimaps);
+               } else {
+                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
+               }
+               if (error)
+                       goto out_unlock;
+
+               trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+       } else if (nimaps) {
+               trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+       } else {
+               trace_xfs_get_blocks_notfound(ip, offset, size);
+               goto out_unlock;
+       }
+       xfs_iunlock(ip, lockmode);
 
        if (imap.br_startblock != HOLESTARTBLOCK &&
            imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
        }
 
        return 0;
+
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return -error;
 }
 
 int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
        ssize_t                 ret;
 
        if (rw & WRITE) {
-               iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+               iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
 
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
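
The rewritten writepage path above drops the old BMAPI flag juggling in
favour of a small per-buffer type state machine. A condensed sketch of
that pattern (illustrative only, not the verbatim kernel code; new_type
is a local introduced here for clarity):

    /*
     * Classify each buffer_head; whenever the I/O type changes,
     * invalidate the cached mapping so that xfs_map_blocks() is
     * called once per contiguous run of same-typed buffers.
     */
    int new_type = type;

    if (buffer_unwritten(bh))
            new_type = IO_UNWRITTEN;
    else if (buffer_delay(bh))
            new_type = IO_DELALLOC;
    else if (buffer_uptodate(bh))
            new_type = IO_OVERWRITE;

    if (new_type != type) {
            type = new_type;
            imap_valid = 0;         /* force a fresh mapping lookup */
    }
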
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb..71f721e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -22,6 +22,22 @@ extern struct workqueue_struct *xfsdatad_workqueue;
 extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+       IO_DIRECT = 0,  /* special case for direct I/O ioends */
+       IO_DELALLOC,    /* mapping covers delalloc region */
+       IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
+       IO_OVERWRITE,   /* mapping covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+       { 0,                    "" }, \
+       { IO_DELALLOC,          "delalloc" }, \
+       { IO_UNWRITTEN,         "unwritten" }, \
+       { IO_OVERWRITE,         "overwrite" }
+
 /*
  * xfs_ioend struct manages large extent writes for XFS.
  * It can manage several multi-page bio's at once.
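
The { value, "string" } pairs in XFS_IO_TYPES above follow the
tracepoint symbolic-print convention, so trace events can render the
numeric I/O type as text. A hedged usage sketch (the format string is
illustrative, not a specific event from this patch):

    TP_printk("dev %d:%d ino 0x%llx type %s",
              MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
              __print_symbolic(__entry->type, XFS_IO_TYPES))
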
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6..92f1f2a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-static struct shrinker xfs_buf_shake = {
-       .shrink = xfsbufd_wakeup,
-       .seeks = DEFAULT_SEEKS,
-};
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
 }
 
 /*
- *     Internal xfs_buf_t object manipulation
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
  */
+STATIC void
+xfs_buf_lru_add(
+       struct xfs_buf  *bp)
+{
+       struct xfs_buftarg *btp = bp->b_target;
+
+       spin_lock(&btp->bt_lru_lock);
+       if (list_empty(&bp->b_lru)) {
+               atomic_inc(&bp->b_hold);
+               list_add_tail(&bp->b_lru, &btp->bt_lru);
+               btp->bt_lru_nr++;
+       }
+       spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are no
+ * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is
+ * there to optimise the shrinker removing the buffer from the LRU and
+ * calling xfs_buf_free(), i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ */
+STATIC void
+xfs_buf_lru_del(
+       struct xfs_buf  *bp)
+{
+       struct xfs_buftarg *btp = bp->b_target;
+
+       if (list_empty(&bp->b_lru))
+               return;
+
+       spin_lock(&btp->bt_lru_lock);
+       if (!list_empty(&bp->b_lru)) {
+               list_del_init(&bp->b_lru);
+               btp->bt_lru_nr--;
+       }
+       spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+       struct xfs_buf  *bp)
+{
+       bp->b_flags |= XBF_STALE;
+       atomic_set(&(bp)->b_lru_ref, 0);
+       if (!list_empty(&bp->b_lru)) {
+               struct xfs_buftarg *btp = bp->b_target;
+
+               spin_lock(&btp->bt_lru_lock);
+               if (!list_empty(&bp->b_lru)) {
+                       list_del_init(&bp->b_lru);
+                       btp->bt_lru_nr--;
+                       atomic_dec(&bp->b_hold);
+               }
+               spin_unlock(&btp->bt_lru_lock);
+       }
+       ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
 
 STATIC void
 _xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
 
        memset(bp, 0, sizeof(xfs_buf_t));
        atomic_set(&bp->b_hold, 1);
+       atomic_set(&bp->b_lru_ref, 1);
        init_completion(&bp->b_iowait);
+       INIT_LIST_HEAD(&bp->b_lru);
        INIT_LIST_HEAD(&bp->b_list);
        RB_CLEAR_NODE(&bp->b_rbnode);
        sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
 
+       ASSERT(list_empty(&bp->b_lru));
+
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
 
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
                                        __func__, gfp_mask);
 
                        XFS_STATS_INC(xb_page_retries);
-                       xfsbufd_wakeup(NULL, 0, gfp_mask);
                        congestion_wait(BLK_RW_ASYNC, HZ/50);
                        goto retry;
                }
@@ -828,6 +897,7 @@ xfs_buf_rele(
 
        if (!pag) {
                ASSERT(!bp->b_relse);
+               ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
@@ -835,13 +905,19 @@ xfs_buf_rele(
        }
 
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
        ASSERT(atomic_read(&bp->b_hold) > 0);
        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
                        spin_unlock(&pag->pag_buf_lock);
                        bp->b_relse(bp);
+               } else if (!(bp->b_flags & XBF_STALE) &&
+                          atomic_read(&bp->b_lru_ref)) {
+                       xfs_buf_lru_add(bp);
+                       spin_unlock(&pag->pag_buf_lock);
                } else {
+                       xfs_buf_lru_del(bp);
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
                        rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
                        spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
  */
 
 /*
- *     Wait for any bufs with callbacks that have been submitted but
- *     have not yet returned... walk the hash list for the target.
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
  */
 void
 xfs_wait_buftarg(
        struct xfs_buftarg      *btp)
 {
-       struct xfs_perag        *pag;
-       uint                    i;
+       struct xfs_buf          *bp;
 
-       for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
-               pag = xfs_perag_get(btp->bt_mount, i);
-               spin_lock(&pag->pag_buf_lock);
-               while (rb_first(&pag->pag_buf_tree)) {
-                       spin_unlock(&pag->pag_buf_lock);
+restart:
+       spin_lock(&btp->bt_lru_lock);
+       while (!list_empty(&btp->bt_lru)) {
+               bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+               if (atomic_read(&bp->b_hold) > 1) {
+                       spin_unlock(&btp->bt_lru_lock);
                        delay(100);
-                       spin_lock(&pag->pag_buf_lock);
+                       goto restart;
                }
-               spin_unlock(&pag->pag_buf_lock);
-               xfs_perag_put(pag);
+               /*
+                * clear the LRU reference count so the buffer doesn't get
+                * ignored in xfs_buf_rele().
+                */
+               atomic_set(&bp->b_lru_ref, 0);
+               spin_unlock(&btp->bt_lru_lock);
+               xfs_buf_rele(bp);
+               spin_lock(&btp->bt_lru_lock);
        }
+       spin_unlock(&btp->bt_lru_lock);
 }
 
-/*
- *     buftarg list for delwrite queue processing
- */
-static LIST_HEAD(xfs_buftarg_list);
-static DEFINE_SPINLOCK(xfs_buftarg_lock);
-
-STATIC void
-xfs_register_buftarg(
-       xfs_buftarg_t           *btp)
+int
+xfs_buftarg_shrink(
+       struct shrinker         *shrink,
+       int                     nr_to_scan,
+       gfp_t                   mask)
 {
-       spin_lock(&xfs_buftarg_lock);
-       list_add(&btp->bt_list, &xfs_buftarg_list);
-       spin_unlock(&xfs_buftarg_lock);
-}
+       struct xfs_buftarg      *btp = container_of(shrink,
+                                       struct xfs_buftarg, bt_shrinker);
+       struct xfs_buf          *bp;
+       LIST_HEAD(dispose);
 
-STATIC void
-xfs_unregister_buftarg(
-       xfs_buftarg_t           *btp)
-{
-       spin_lock(&xfs_buftarg_lock);
-       list_del(&btp->bt_list);
-       spin_unlock(&xfs_buftarg_lock);
+       if (!nr_to_scan)
+               return btp->bt_lru_nr;
+
+       spin_lock(&btp->bt_lru_lock);
+       while (!list_empty(&btp->bt_lru)) {
+               if (nr_to_scan-- <= 0)
+                       break;
+
+               bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+
+               /*
+                * Decrement the b_lru_ref count unless the value is already
+                * zero. If the value is already zero, we need to reclaim the
+                * buffer, otherwise it gets another trip through the LRU.
+                */
+               if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+                       list_move_tail(&bp->b_lru, &btp->bt_lru);
+                       continue;
+               }
+
+               /*
+                * remove the buffer from the LRU now to avoid needing another
+                * lock round trip inside xfs_buf_rele().
+                */
+               list_move(&bp->b_lru, &dispose);
+               btp->bt_lru_nr--;
+       }
+       spin_unlock(&btp->bt_lru_lock);
+
+       while (!list_empty(&dispose)) {
+               bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+               list_del_init(&bp->b_lru);
+               xfs_buf_rele(bp);
+       }
+
+       return btp->bt_lru_nr;
 }
 
 void
@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
        struct xfs_mount        *mp,
        struct xfs_buftarg      *btp)
 {
+       unregister_shrinker(&btp->bt_shrinker);
+
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
        iput(btp->bt_mapping->host);
 
-       /* Unregister the buftarg first so that we don't get a
-        * wakeup finding a non-existent task
-        */
-       xfs_unregister_buftarg(btp);
        kthread_stop(btp->bt_task);
-
        kmem_free(btp);
 }
 
@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
        xfs_buftarg_t           *btp,
        const char              *fsname)
 {
-       int     error = 0;
-
-       INIT_LIST_HEAD(&btp->bt_list);
        INIT_LIST_HEAD(&btp->bt_delwrite_queue);
        spin_lock_init(&btp->bt_delwrite_lock);
        btp->bt_flags = 0;
        btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
-       if (IS_ERR(btp->bt_task)) {
-               error = PTR_ERR(btp->bt_task);
-               goto out_error;
-       }
-       xfs_register_buftarg(btp);
-out_error:
-       return error;
+       if (IS_ERR(btp->bt_task))
+               return PTR_ERR(btp->bt_task);
+       return 0;
 }
 
 xfs_buftarg_t *
@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
        btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
        btp->bt_bdev = bdev;
+       INIT_LIST_HEAD(&btp->bt_lru);
+       spin_lock_init(&btp->bt_lru_lock);
        if (xfs_setsize_buftarg_early(btp, bdev))
                goto error;
        if (xfs_mapping_buftarg(btp, bdev))
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
+       btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+       btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+       register_shrinker(&btp->bt_shrinker);
        return btp;
 
 error:
@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
        flush_workqueue(queue);
 }
 
-STATIC int
-xfsbufd_wakeup(
-       struct shrinker         *shrink,
-       int                     priority,
-       gfp_t                   mask)
-{
-       xfs_buftarg_t           *btp;
-
-       spin_lock(&xfs_buftarg_lock);
-       list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
-               if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
-                       continue;
-               if (list_empty(&btp->bt_delwrite_queue))
-                       continue;
-               set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
-               wake_up_process(btp->bt_task);
-       }
-       spin_unlock(&xfs_buftarg_lock);
-       return 0;
-}
-
 /*
  * Move as many buffers as specified to the supplied list
  * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
        if (!xfsconvertd_workqueue)
                goto out_destroy_xfsdatad_workqueue;
 
-       register_shrinker(&xfs_buf_shake);
        return 0;
 
  out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-       unregister_shrinker(&xfs_buf_shake);
        destroy_workqueue(xfsconvertd_workqueue);
        destroy_workqueue(xfsdatad_workqueue);
        destroy_workqueue(xfslogd_workqueue);
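
xfs_buftarg_shrink() above implements the 2.6.37-era shrinker contract:
when called with nr_to_scan == 0 it only reports how many objects are
reclaimable, otherwise it frees up to nr_to_scan objects and returns the
remaining count. A minimal sketch of that contract under the same
assumptions (demo_cache and its helpers are hypothetical):

    static int demo_cache_shrink(struct shrinker *shrink, int nr_to_scan,
                                 gfp_t gfp_mask)
    {
            struct demo_cache *c = container_of(shrink, struct demo_cache,
                                                shrinker);

            if (!nr_to_scan)
                    return c->nr_objects;   /* query pass: count only */

            /* reclaim pass: evict LRU-first until the quota is spent */
            while (nr_to_scan-- > 0 && demo_cache_evict_one(c))
                    ;

            return c->nr_objects;
    }
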
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f3..a76c242 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
 
        /* per device delwri queue */
        struct task_struct      *bt_task;
-       struct list_head        bt_list;
        struct list_head        bt_delwrite_queue;
        spinlock_t              bt_delwrite_lock;
        unsigned long           bt_flags;
+
+       /* LRU control structures */
+       struct shrinker         bt_shrinker;
+       struct list_head        bt_lru;
+       spinlock_t              bt_lru_lock;
+       unsigned int            bt_lru_nr;
 } xfs_buftarg_t;
 
 /*
@@ -164,9 +169,11 @@ typedef struct xfs_buf {
        xfs_off_t               b_file_offset;  /* offset in file */
        size_t                  b_buffer_length;/* size of buffer in bytes */
        atomic_t                b_hold;         /* reference count */
+       atomic_t                b_lru_ref;      /* lru reclaim ref count */
        xfs_buf_flags_t         b_flags;        /* status flags */
        struct semaphore        b_sema;         /* semaphore for lockables */
 
+       struct list_head        b_lru;          /* lru list */
        wait_queue_head_t       b_waiters;      /* unpin waiters */
        struct list_head        b_list;
        struct xfs_perag        *b_pag;         /* contains rbtree root */
@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_ZEROFLAGS(bp)  ((bp)->b_flags &= \
                ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
 
-#define XFS_BUF_STALE(bp)      ((bp)->b_flags |= XBF_STALE)
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_STALE(bp)      xfs_buf_stale(bp)
 #define XFS_BUF_UNSTALE(bp)    ((bp)->b_flags &= ~XBF_STALE)
 #define XFS_BUF_ISSTALE(bp)    ((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp)        do {                            \
@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_SIZE(bp)               ((bp)->b_buffer_length)
 #define XFS_BUF_SET_SIZE(bp, cnt)      ((bp)->b_buffer_length = (cnt))
 
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)   do { } while (0)
+static inline void
+xfs_buf_set_ref(
+       struct xfs_buf  *bp,
+       int             lru_ref)
+{
+       atomic_set(&bp->b_lru_ref, lru_ref);
+}
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)   xfs_buf_set_ref(bp, ref)
 #define XFS_BUF_SET_VTYPE(bp, type)            do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref)               do { } while (0)
 
 #define XFS_BUF_ISPINNED(bp)   atomic_read(&((bp)->b_pin_count))
 
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74..fc0114d 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
        else
                fileid_type = FILEID_INO32_GEN_PARENT;
 
-       /* filesystem may contain 64bit inode numbers */
-       if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
+       /*
+        * If the filesystem may contain 64-bit inode numbers, we need
+        * to use larger file handles that can represent them.
+        *
+        * While we may currently only allocate inodes that fit into 32
+        * bits, any large enough filesystem may already contain 64-bit
+        * inodes, thus the slightly confusing looking conditional below.
+        */
+       if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+           (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
                fileid_type |= XFS_FILEID_TYPE_64FLAG;
 
        /*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd7..0964949 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
 
 #include <kmem.h>
 #include <mrlock.h>
-#include <sv.h>
 #include <time.h>
 
 #include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964..c51faaa 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -834,8 +834,11 @@ xfsaild_wakeup(
        struct xfs_ail          *ailp,
        xfs_lsn_t               threshold_lsn)
 {
-       ailp->xa_target = threshold_lsn;
-       wake_up_process(ailp->xa_task);
+       /* only ever move the target forwards */
+       if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
+               ailp->xa_target = threshold_lsn;
+               wake_up_process(ailp->xa_task);
+       }
 }
 
 STATIC int
@@ -847,8 +850,17 @@ xfsaild(
        long            tout = 0; /* milliseconds */
 
        while (!kthread_should_stop()) {
-               schedule_timeout_interruptible(tout ?
-                               msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+               /*
+                * for short sleeps indicating congestion, don't allow us to
+                * get woken early. Otherwise all we do is bang on the AIL lock
+                * without making progress.
+                */
+               if (tout && tout <= 20)
+                       __set_current_state(TASK_KILLABLE);
+               else
+                       __set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(tout ?
+                                msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
 
                /* swsusp */
                try_to_freeze();
@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
         */
        ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+       lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                       &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
 
        xfs_inactive(ip);
 }
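
The lockdep_set_class_and_name() call above re-keys the iolock for
inodes entering reclaim, so lockdep tracks that usage separately from
live-inode iolocks. The key it references is declared elsewhere in the
patch; a sketch of what such a declaration looks like:

    /*
     * One lock class shared by all iolocks of inodes heading into
     * reclaim, keeping their lock-order reports distinct from those
     * of iolocks on live inodes.
     */
    static struct lock_class_key xfs_iolock_reclaimable;
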
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7c..a02480d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
        struct inode            *inode = VFS_I(ip);
 
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;
 
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               return ENOENT;
-
        /* If we can't grab the inode, it must be on its way to reclaim. */
        if (!igrab(inode))
                return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
        /* inode is valid */
        return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ restart:
                int             error = 0;
                int             i;
 
-               read_lock(&pag->pag_ici_lock);
+               rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
                        break;
                }
 
@@ -118,18 +138,26 @@ restart:
                                batch[i] = NULL;
 
                        /*
-                        * Update the index for the next lookup. Catch overflows
-                        * into the next AG range which can occur if we have inodes
-                        * in the last block of the AG and we are currently
-                        * pointing to the last inode.
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that led
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
                         */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = 1;
                }
 
                /* unlock now we've grabbed the inodes. */
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
        struct xfs_perag *pag;
 
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
        spin_lock(&ip->i_flags_lock);
        __xfs_inode_set_reclaim_tag(pag, ip);
        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
        spin_unlock(&ip->i_flags_lock);
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
        xfs_perag_put(pag);
 }
 
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
        struct xfs_inode        *ip,
        int                     flags)
 {
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
 
        /*
-        * do some unlocked checks first to avoid unnecceary lock traffic.
+        * do some unlocked checks first to avoid unnecessary lock traffic.
         * The first is a flush lock check, the second an already-in-reclaim
         * check. Only do these checks if we are not going to block on locks.
         */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
         * The radix tree lock here protects a thread in xfs_iget from racing
         * with us starting reclaim on the inode.  Once we have the
         * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check the
+        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
         */
        spin_lock(&ip->i_flags_lock);
-       ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* ignore as it is already under reclaim */
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
                spin_unlock(&ip->i_flags_lock);
                return 1;
        }
@@ -795,12 +833,12 @@ reclaim:
         * added to the tree assert that it's been there before to catch
         * problems with the inode life time early on.
         */
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
        __xfs_inode_clear_reclaim(pag, ip);
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
 
        /*
         * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +902,14 @@ restart:
                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                        int     i;
 
-                       write_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH,
                                        XFS_ICI_RECLAIM_TAG);
                        if (!nr_found) {
-                               write_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                break;
                        }
 
@@ -891,14 +929,24 @@ restart:
                                 * occur if we have inodes in the last block of
                                 * the AG and we are currently pointing to the
                                 * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that led us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
                                 */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                        done = 1;
                        }
 
                        /* unlock now we've grabbed the inodes. */
-                       write_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
 
                        for (i = 0; i < nr_found; i++) {
                                if (!batch[i])
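
The walk changes in this file all follow the same RCU lookup
discipline: find inode pointers under rcu_read_lock(), then re-validate
each candidate under its i_flags_lock before taking a real reference,
because the RCU lookup may return inodes that have been freed and
reallocated. A condensed sketch of that pattern (not the verbatim
kernel code):

    rcu_read_lock();
    nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch,
                                      first_index, XFS_LOOKUP_BATCH);
    for (i = 0; i < nr_found; i++) {
            struct xfs_inode *ip = batch[i];

            spin_lock(&ip->i_flags_lock);
            if (!ip->i_ino ||           /* stale: freed under RCU */
                __xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE |
                                      XFS_IRECLAIM)) {
                    spin_unlock(&ip->i_flags_lock);
                    batch[i] = NULL;    /* not a valid walk target */
                    continue;
            }
            spin_unlock(&ip->i_flags_lock);
    }
    rcu_read_unlock();
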
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e9..647af2a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __field(int, curr_res)
                __field(int, unit_res)
                __field(unsigned int, flags)
-               __field(void *, reserve_headq)
-               __field(void *, write_headq)
+               __field(int, reserveq)
+               __field(int, writeq)
                __field(int, grant_reserve_cycle)
                __field(int, grant_reserve_bytes)
                __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_res = tic->t_curr_res;
                __entry->unit_res = tic->t_unit_res;
                __entry->flags = tic->t_flags;
-               __entry->reserve_headq = log->l_reserve_headq;
-               __entry->write_headq = log->l_write_headq;
-               __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
-               __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
-               __entry->grant_write_cycle = log->l_grant_write_cycle;
-               __entry->grant_write_bytes = log->l_grant_write_bytes;
+               __entry->reserveq = list_empty(&log->l_reserveq);
+               __entry->writeq = list_empty(&log->l_writeq);
+               xlog_crack_grant_head(&log->l_grant_reserve_head,
+                               &__entry->grant_reserve_cycle,
+                               &__entry->grant_reserve_bytes);
+               xlog_crack_grant_head(&log->l_grant_write_head,
+                               &__entry->grant_write_cycle,
+                               &__entry->grant_write_bytes);
                __entry->curr_cycle = log->l_curr_cycle;
                __entry->curr_block = log->l_curr_block;
-               __entry->tail_lsn = log->l_tail_lsn;
+               __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
        ),
        TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
-                 "t_unit_res %u t_flags %s reserve_headq 0x%p "
-                 "write_headq 0x%p grant_reserve_cycle %d "
+                 "t_unit_res %u t_flags %s reserveq %s "
+                 "writeq %s grant_reserve_cycle %d "
                  "grant_reserve_bytes %d grant_write_cycle %d "
                  "grant_write_bytes %d curr_cycle %d curr_block %d "
                  "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                  __entry->curr_res,
                  __entry->unit_res,
                  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
-                 __entry->reserve_headq,
-                 __entry->write_headq,
+                 __entry->reserveq ? "empty" : "active",
+                 __entry->writeq ? "empty" : "active",
                  __entry->grant_reserve_cycle,
                  __entry->grant_reserve_bytes,
                  __entry->grant_write_cycle,
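
The xlog_crack_grant_head() calls above reflect the new packed grant head
representation: one atomic64_t per head, with the cycle in the upper 32 bits
and the byte count in the lower 32, so the heads can be sampled without taking
the grant lock. A sketch of the crack/assign helpers this implies (names as
read from this series' xfs_log_priv.h changes, which are not part of this
hunk):

	static inline void
	xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
	{
		*cycle = val >> 32;		/* high 32 bits: cycle number */
		*space = val & 0xffffffff;	/* low 32 bits: bytes used */
	}

	static inline void
	xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
	{
		xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
	}

	static inline int64_t
	xlog_assign_grant_head_val(int cycle, int space)
	{
		return ((int64_t)cycle << 32) | space;
	}
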
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-DECLARE_EVENT_CLASS(xfs_iomap_class,
+DECLARE_EVENT_CLASS(xfs_imap_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-                int flags, struct xfs_bmbt_irec *irec),
-       TP_ARGS(ip, offset, count, flags, irec),
+                int type, struct xfs_bmbt_irec *irec),
+       TP_ARGS(ip, offset, count, type, irec),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
-               __field(int, flags)
+               __field(int, type)
                __field(xfs_fileoff_t, startoff)
                __field(xfs_fsblock_t, startblock)
                __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
-               __entry->flags = flags;
+               __entry->type = type;
                __entry->startoff = irec ? irec->br_startoff : 0;
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                 "offset 0x%llx count %zd flags %s "
+                 "offset 0x%llx count %zd type %s "
                  "startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
                  __entry->new_size,
                  __entry->offset,
                  __entry->count,
-                 __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+                 __print_symbolic(__entry->type, XFS_IO_TYPES),
                  __entry->startoff,
                  (__int64_t)__entry->startblock,
                  __entry->blockcount)
 )
 
 #define DEFINE_IOMAP_EVENT(name)       \
-DEFINE_EVENT(xfs_iomap_class, name,    \
+DEFINE_EVENT(xfs_imap_class, name,     \
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-                int flags, struct xfs_bmbt_irec *irec),                \
-       TP_ARGS(ip, offset, count, flags, irec))
-DEFINE_IOMAP_EVENT(xfs_iomap_enter);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+                int type, struct xfs_bmbt_irec *irec),         \
+       TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name,   \
        TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 
 
 TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
        TP_PROTO(struct xfs_alloc_arg *args), \
        TP_ARGS(args))
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
index faf8e1a..d22aa31 100644 (file)
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
        ASSERT(list_empty(&dqp->q_freelist));
 
        mutex_destroy(&dqp->q_qlock);
-       sv_destroy(&dqp->q_pinwait);
        kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
 
        atomic_dec(&xfs_Gqm->qm_totaldquots);
index 63c7a1a..58632cc 100644 (file)
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
 
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
-       rwlock_t        pag_ici_lock;   /* incore inode lock */
+       spinlock_t      pag_ici_lock;   /* incore inode cache lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
        int             pag_ici_reclaimable;    /* reclaimable inodes */
        struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
index 112abc4..fa8723f 100644 (file)
@@ -577,61 +577,58 @@ xfs_alloc_ag_vextent_exact(
        xfs_extlen_t    rlen;   /* length of returned extent */
 
        ASSERT(args->alignment == 1);
+
        /*
         * Allocate/initialize a cursor for the by-number freespace btree.
         */
        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO);
+                                         args->agno, XFS_BTNUM_BNO);
+
        /*
         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
         * Look for the closest free block <= bno, it must contain bno
         * if any free block does.
         */
-       if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+       error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+       if (error)
                goto error0;
-       if (!i) {
-               /*
-                * Didn't find it, return null.
-                */
-               xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-               args->agbno = NULLAGBLOCK;
-               return 0;
-       }
+       if (!i)
+               goto not_found;
+
        /*
         * Grab the freespace record.
         */
-       if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+       error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+       if (error)
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        ASSERT(fbno <= args->agbno);
        minend = args->agbno + args->minlen;
        maxend = args->agbno + args->maxlen;
        fend = fbno + flen;
+
        /*
         * Give up if the freespace isn't long enough for the minimum request.
         */
-       if (fend < minend) {
-               xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-               args->agbno = NULLAGBLOCK;
-               return 0;
-       }
+       if (fend < minend)
+               goto not_found;
+
        /*
         * End of extent will be smaller of the freespace end and the
         * maximal requested end.
-        */
-       end = XFS_AGBLOCK_MIN(fend, maxend);
-       /*
+        *
         * Fix the length according to mod and prod if given.
         */
+       end = XFS_AGBLOCK_MIN(fend, maxend);
        args->len = end - args->agbno;
        xfs_alloc_fix_len(args);
-       if (!xfs_alloc_fix_minleft(args)) {
-               xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-               return 0;
-       }
+       if (!xfs_alloc_fix_minleft(args))
+               goto not_found;
+
        rlen = args->len;
        ASSERT(args->agbno + rlen <= fend);
        end = args->agbno + rlen;
+
        /*
         * We are allocating agbno for rlen [agbno .. end]
         * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +637,25 @@ xfs_alloc_ag_vextent_exact(
                args->agno, XFS_BTNUM_CNT);
        ASSERT(args->agbno + args->len <=
                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-       if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
-                       args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+       error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+                                     args->len, XFSA_FIXUP_BNO_OK);
+       if (error) {
                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
                goto error0;
        }
+
        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 
-       trace_xfs_alloc_exact_done(args);
        args->wasfromfl = 0;
+       trace_xfs_alloc_exact_done(args);
+       return 0;
+
+not_found:
+       /* Didn't find it, return null. */
+       xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+       args->agbno = NULLAGBLOCK;
+       trace_xfs_alloc_exact_notfound(args);
        return 0;
 
 error0:
@@ -658,6 +664,95 @@ error0:
        return error;
 }
 
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already identified.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+       struct xfs_alloc_arg    *args,  /* allocation argument structure */
+       struct xfs_btree_cur    **gcur, /* good cursor */
+       struct xfs_btree_cur    **scur, /* searching cursor */
+       xfs_agblock_t           gdiff,  /* difference for search comparison */
+       xfs_agblock_t           *sbno,  /* extent found by search */
+       xfs_extlen_t            *slen,
+       xfs_extlen_t            *slena, /* aligned length */
+       int                     dir)    /* 0 = search right, 1 = search left */
+{
+       xfs_agblock_t           bno;
+       xfs_agblock_t           new;
+       xfs_agblock_t           sdiff;
+       int                     error;
+       int                     i;
+
+       /* The good extent is perfect, no need to search. */
+       if (!gdiff)
+               goto out_use_good;
+
+       /*
+        * Look until we find a better one, run out of space or run off the end.
+        */
+       do {
+               error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
+                                         args->minlen, &bno, slena);
+
+               /*
+                * The good extent is closer than this one.
+                */
+               if (!dir) {
+                       if (bno >= args->agbno + gdiff)
+                               goto out_use_good;
+               } else {
+                       if (bno <= args->agbno - gdiff)
+                               goto out_use_good;
+               }
+
+               /*
+                * Same distance, compare length and pick the best.
+                */
+               if (*slena >= args->minlen) {
+                       args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+                       xfs_alloc_fix_len(args);
+
+                       sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                                      args->alignment, *sbno,
+                                                      *slen, &new);
+
+                       /*
+                        * Choose closer size and invalidate other cursor.
+                        */
+                       if (sdiff < gdiff)
+                               goto out_use_search;
+                       goto out_use_good;
+               }
+
+               if (!dir)
+                       error = xfs_btree_increment(*scur, 0, &i);
+               else
+                       error = xfs_btree_decrement(*scur, 0, &i);
+               if (error)
+                       goto error0;
+       } while (i);
+
+out_use_good:
+       xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+       *scur = NULL;
+       return 0;
+
+out_use_search:
+       xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+       *gcur = NULL;
+       return 0;
+
+error0:
+       /* caller invalidates cursors */
+       return error;
+}
+
 /*
  * Allocate a variable extent near bno in the allocation group agno.
  * Extent's length (returned in len) will be between minlen and maxlen,
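
The helper above collapses the two mirror-image scan loops deleted further
down into a single parameterised walk. Its two call sites (visible later in
this diff) differ only in which cursor holds the known-good extent and in the
direction of the search:

	/* left extent is currently best: scan right for a closer one */
	error = xfs_alloc_find_best_extent(args, &bno_cur_lt, &bno_cur_gt,
					   ltdiff, &gtbno, &gtlen, &gtlena,
					   0 /* search right */);

	/* right extent is currently best: scan left for a closer one */
	error = xfs_alloc_find_best_extent(args, &bno_cur_gt, &bno_cur_lt,
					   gtdiff, &ltbno, &ltlen, &ltlena,
					   1 /* search left */);
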
@@ -925,203 +1020,45 @@ xfs_alloc_ag_vextent_near(
                        }
                }
        } while (bno_cur_lt || bno_cur_gt);
+
        /*
         * Got both cursors still active, need to find better entry.
         */
        if (bno_cur_lt && bno_cur_gt) {
-               /*
-                * Left side is long enough, look for a right side entry.
-                */
                if (ltlena >= args->minlen) {
                        /*
-                        * Fix up the length.
+                        * Left side is good, look for a right side entry.
                         */
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
-                       rlen = args->len;
-                       ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+                       ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
                                args->alignment, ltbno, ltlen, &ltnew);
+
+                       error = xfs_alloc_find_best_extent(args,
+                                               &bno_cur_lt, &bno_cur_gt,
+                                               ltdiff, &gtbno, &gtlen, &gtlena,
+                                               0 /* search right */);
+               } else {
+                       ASSERT(gtlena >= args->minlen);
+
                        /*
-                        * Not perfect.
-                        */
-                       if (ltdiff) {
-                               /*
-                                * Look until we find a better one, run out of
-                                * space, or run off the end.
-                                */
-                               while (bno_cur_lt && bno_cur_gt) {
-                                       if ((error = xfs_alloc_get_rec(
-                                                       bno_cur_gt, &gtbno,
-                                                       &gtlen, &i)))
-                                               goto error0;
-                                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                                       xfs_alloc_compute_aligned(gtbno, gtlen,
-                                               args->alignment, args->minlen,
-                                               &gtbnoa, &gtlena);
-                                       /*
-                                        * The left one is clearly better.
-                                        */
-                                       if (gtbnoa >= args->agbno + ltdiff) {
-                                               xfs_btree_del_cursor(
-                                                       bno_cur_gt,
-                                                       XFS_BTREE_NOERROR);
-                                               bno_cur_gt = NULL;
-                                               break;
-                                       }
-                                       /*
-                                        * If we reach a big enough entry,
-                                        * compare the two and pick the best.
-                                        */
-                                       if (gtlena >= args->minlen) {
-                                               args->len =
-                                                       XFS_EXTLEN_MIN(gtlena,
-                                                               args->maxlen);
-                                               xfs_alloc_fix_len(args);
-                                               rlen = args->len;
-                                               gtdiff = xfs_alloc_compute_diff(
-                                                       args->agbno, rlen,
-                                                       args->alignment,
-                                                       gtbno, gtlen, &gtnew);
-                                               /*
-                                                * Right side is better.
-                                                */
-                                               if (gtdiff < ltdiff) {
-                                                       xfs_btree_del_cursor(
-                                                               bno_cur_lt,
-                                                               XFS_BTREE_NOERROR);
-                                                       bno_cur_lt = NULL;
-                                               }
-                                               /*
-                                                * Left side is better.
-                                                */
-                                               else {
-                                                       xfs_btree_del_cursor(
-                                                               bno_cur_gt,
-                                                               XFS_BTREE_NOERROR);
-                                                       bno_cur_gt = NULL;
-                                               }
-                                               break;
-                                       }
-                                       /*
-                                        * Fell off the right end.
-                                        */
-                                       if ((error = xfs_btree_increment(
-                                                       bno_cur_gt, 0, &i)))
-                                               goto error0;
-                                       if (!i) {
-                                               xfs_btree_del_cursor(
-                                                       bno_cur_gt,
-                                                       XFS_BTREE_NOERROR);
-                                               bno_cur_gt = NULL;
-                                               break;
-                                       }
-                               }
-                       }
-                       /*
-                        * The left side is perfect, trash the right side.
-                        */
-                       else {
-                               xfs_btree_del_cursor(bno_cur_gt,
-                                                    XFS_BTREE_NOERROR);
-                               bno_cur_gt = NULL;
-                       }
-               }
-               /*
-                * It's the right side that was found first, look left.
-                */
-               else {
-                       /*
-                        * Fix up the length.
+                        * Right side is good, look for a left side entry.
                         */
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
-                       rlen = args->len;
-                       gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+                       gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
                                args->alignment, gtbno, gtlen, &gtnew);
-                       /*
-                        * Right side entry isn't perfect.
-                        */
-                       if (gtdiff) {
-                               /*
-                                * Look until we find a better one, run out of
-                                * space, or run off the end.
-                                */
-                               while (bno_cur_lt && bno_cur_gt) {
-                                       if ((error = xfs_alloc_get_rec(
-                                                       bno_cur_lt, &ltbno,
-                                                       &ltlen, &i)))
-                                               goto error0;
-                                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                                       xfs_alloc_compute_aligned(ltbno, ltlen,
-                                               args->alignment, args->minlen,
-                                               &ltbnoa, &ltlena);
-                                       /*
-                                        * The right one is clearly better.
-                                        */
-                                       if (ltbnoa <= args->agbno - gtdiff) {
-                                               xfs_btree_del_cursor(
-                                                       bno_cur_lt,
-                                                       XFS_BTREE_NOERROR);
-                                               bno_cur_lt = NULL;
-                                               break;
-                                       }
-                                       /*
-                                        * If we reach a big enough entry,
-                                        * compare the two and pick the best.
-                                        */
-                                       if (ltlena >= args->minlen) {
-                                               args->len = XFS_EXTLEN_MIN(
-                                                       ltlena, args->maxlen);
-                                               xfs_alloc_fix_len(args);
-                                               rlen = args->len;
-                                               ltdiff = xfs_alloc_compute_diff(
-                                                       args->agbno, rlen,
-                                                       args->alignment,
-                                                       ltbno, ltlen, &ltnew);
-                                               /*
-                                                * Left side is better.
-                                                */
-                                               if (ltdiff < gtdiff) {
-                                                       xfs_btree_del_cursor(
-                                                               bno_cur_gt,
-                                                               XFS_BTREE_NOERROR);
-                                                       bno_cur_gt = NULL;
-                                               }
-                                               /*
-                                                * Right side is better.
-                                                */
-                                               else {
-                                                       xfs_btree_del_cursor(
-                                                               bno_cur_lt,
-                                                               XFS_BTREE_NOERROR);
-                                                       bno_cur_lt = NULL;
-                                               }
-                                               break;
-                                       }
-                                       /*
-                                        * Fell off the left end.
-                                        */
-                                       if ((error = xfs_btree_decrement(
-                                                       bno_cur_lt, 0, &i)))
-                                               goto error0;
-                                       if (!i) {
-                                               xfs_btree_del_cursor(bno_cur_lt,
-                                                       XFS_BTREE_NOERROR);
-                                               bno_cur_lt = NULL;
-                                               break;
-                                       }
-                               }
-                       }
-                       /*
-                        * The right side is perfect, trash the left side.
-                        */
-                       else {
-                               xfs_btree_del_cursor(bno_cur_lt,
-                                       XFS_BTREE_NOERROR);
-                               bno_cur_lt = NULL;
-                       }
+
+                       error = xfs_alloc_find_best_extent(args,
+                                               &bno_cur_gt, &bno_cur_lt,
+                                               gtdiff, &ltbno, &ltlen, &ltlena,
+                                               1 /* search left */);
                }
+
+               if (error)
+                       goto error0;
        }
+
        /*
         * If we couldn't get anything, give up.
         */
@@ -1130,6 +1067,7 @@ xfs_alloc_ag_vextent_near(
                args->agbno = NULLAGBLOCK;
                return 0;
        }
+
        /*
         * At this point we have selected a freespace entry, either to the
         * left or to the right.  If it's on the right, copy all the
@@ -1146,6 +1084,7 @@ xfs_alloc_ag_vextent_near(
                j = 1;
        } else
                j = 0;
+
        /*
         * Fix up the length and compute the useful address.
         */
index a6cff8e..71e90dc 100644 (file)
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
         * It didn't all fit, so we have to sort everything on hashval.
         */
        sbsize = sf->hdr.count * sizeof(*sbuf);
-       sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+       sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
 
        /*
         * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
                                args.dp = context->dp;
                                args.whichfork = XFS_ATTR_FORK;
                                args.valuelen = valuelen;
-                               args.value = kmem_alloc(valuelen, KM_SLEEP);
+                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
                                args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
                                args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
                                retval = xfs_attr_rmtval_get(&args);
index 04f9cca..2f9e97c 100644 (file)
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
                return error;
        }
        ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-       if (bp != NULL) {
+       if (bp)
                XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-       }
        *bpp = bp;
        return 0;
 }
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
        switch (cur->bc_btnum) {
        case XFS_BTNUM_BNO:
        case XFS_BTNUM_CNT:
-               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+               XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
                break;
        case XFS_BTNUM_INO:
-               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+               XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
                break;
        case XFS_BTNUM_BMAP:
-               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+               XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
                break;
        default:
                ASSERT(0);
index 2686d0d..ed2b65f 100644 (file)
@@ -142,7 +142,7 @@ xfs_buf_item_log_check(
 #endif
 
 STATIC void    xfs_buf_error_relse(xfs_buf_t *bp);
-STATIC void    xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
+STATIC void    xfs_buf_do_callbacks(struct xfs_buf *bp);
 
 /*
  * This returns the number of log iovecs needed to log the
@@ -450,7 +450,7 @@ xfs_buf_item_unpin(
                 * xfs_trans_ail_delete() drops the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
-                       xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
+                       xfs_buf_do_callbacks(bp);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                } else {
@@ -918,15 +918,26 @@ xfs_buf_attach_iodone(
        XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
 }
 
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow a single
+ * callback to scan the remaining lip->li_bio_list for other items of the
+ * same type and callback, and to process them all in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list: it removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer, and we don't
+ * have to care about maintaining a next item pointer.
+ */
 STATIC void
 xfs_buf_do_callbacks(
-       xfs_buf_t       *bp,
-       xfs_log_item_t  *lip)
+       struct xfs_buf          *bp)
 {
-       xfs_log_item_t  *nlip;
+       struct xfs_log_item     *lip;
 
-       while (lip != NULL) {
-               nlip = lip->li_bio_list;
+       while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
+               XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
                ASSERT(lip->li_cb != NULL);
                /*
                 * Clear the next pointer so we don't have any
@@ -936,7 +947,6 @@ xfs_buf_do_callbacks(
                 */
                lip->li_bio_list = NULL;
                lip->li_cb(bp, lip);
-               lip = nlip;
        }
 }
 
@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks(
                        ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
                        XFS_BUF_SUPER_STALE(bp);
                        trace_xfs_buf_item_iodone(bp, _RET_IP_);
-                       xfs_buf_do_callbacks(bp, lip);
+                       xfs_buf_do_callbacks(bp);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                        xfs_buf_ioend(bp, 0);
@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks(
                return;
        }
 
-       xfs_buf_do_callbacks(bp, lip);
+       xfs_buf_do_callbacks(bp);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
        xfs_buf_ioend(bp, 0);
@@ -1063,7 +1073,7 @@ xfs_buf_error_relse(
         * We have to unpin the pinned buffers so do the
         * callbacks.
         */
-       xfs_buf_do_callbacks(bp, lip);
+       xfs_buf_do_callbacks(bp);
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
        XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
index 0e2ed43..b6ecd20 100644 (file)
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
        xfs_buf_log_format_t    bli_format;     /* in-log header */
 } xfs_buf_log_item_t;
 
-/*
- * This structure is used during recovery to record the buf log
- * items which have been canceled and should not be replayed.
- */
-typedef struct xfs_buf_cancel {
-       xfs_daddr_t             bc_blkno;
-       uint                    bc_len;
-       int                     bc_refcount;
-       struct xfs_buf_cancel   *bc_next;
-} xfs_buf_cancel_t;
-
 void   xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
 void   xfs_buf_item_relse(struct xfs_buf *);
 void   xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
index a55e687..75f2ef6 100644 (file)
@@ -47,6 +47,28 @@ xfs_efi_item_free(
                kmem_zone_free(xfs_efi_zone, efip);
 }
 
+/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the
+ * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
+ * the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+       struct xfs_efi_log_item *efip)
+{
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+
+       if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+               spin_lock(&ailp->xa_lock);
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, &efip->efi_item);
+               xfs_efi_item_free(efip);
+       }
+}
+
 /*
  * This returns the number of iovecs needed to log the given efi item.
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
@@ -74,7 +96,8 @@ xfs_efi_item_format(
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
        uint                    size;
 
-       ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+       ASSERT(atomic_read(&efip->efi_next_extent) ==
+                               efip->efi_format.efi_nextents);
 
        efip->efi_format.efi_type = XFS_LI_EFI;
 
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
 }
 
 /*
- * While EFIs cannot really be pinned, the unpin operation is the
- * last place at which the EFI is manipulated during a transaction.
- * Here we coordinate with xfs_efi_cancel() to determine who gets to
- * free the EFI.
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
+ * which the EFI is manipulated during a transaction.  If we are being asked to
+ * remove the EFI, it's because the transaction has been cancelled and, by
+ * definition, the EFI cannot be in the AIL, so remove it from the
+ * transaction and free it.  Otherwise coordinate with xfs_efi_release() (via
+ * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
  */
 STATIC void
 xfs_efi_item_unpin(
@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
        int                     remove)
 {
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-       struct xfs_ail          *ailp = lip->li_ailp;
-
-       spin_lock(&ailp->xa_lock);
-       if (efip->efi_flags & XFS_EFI_CANCELED) {
-               if (remove)
-                       xfs_trans_del_item(lip);
 
-               /* xfs_trans_ail_delete() drops the AIL lock. */
-               xfs_trans_ail_delete(ailp, lip);
+       if (remove) {
+               ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+               xfs_trans_del_item(lip);
                xfs_efi_item_free(efip);
-       } else {
-               efip->efi_flags |= XFS_EFI_COMMITTED;
-               spin_unlock(&ailp->xa_lock);
+               return;
        }
+       __xfs_efi_release(efip);
 }
 
 /*
@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
 }
 
 /*
- * The EFI is logged only once and cannot be moved in the log, so
- * simply return the lsn at which it's been logged.  The canceled
- * flag is not paid any attention here.  Checking for that is delayed
- * until the EFI is unpinned.
+ * The EFI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.  For bulk transaction committed
+ * processing, the EFI may be processed but not yet unpinned prior to the EFD
+ * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
+ * when processing the EFD.
  */
 STATIC xfs_lsn_t
 xfs_efi_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
+       struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
+       set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
        return lsn;
 }
 
@@ -230,6 +253,7 @@ xfs_efi_init(
        xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
        efip->efi_format.efi_nextents = nextents;
        efip->efi_format.efi_id = (__psint_t)(void*)efip;
+       atomic_set(&efip->efi_next_extent, 0);
 
        return efip;
 }
@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 }
 
 /*
- * This is called by the efd item code below to release references to
- * the given efi item.  Each efd calls this with the number of
- * extents that it has logged, and when the sum of these reaches
- * the total number of extents logged by this efi item we can free
- * the efi item.
- *
- * Freeing the efi item requires that we remove it from the AIL.
- * We'll use the AIL lock to protect our counters as well as
- * the removal from the AIL.
+ * This is called by the efd item code below to release references to the given
+ * efi item.  Each efd calls this with the number of extents that it has
+ * logged, and when the sum of these reaches the total number of extents logged
+ * by this efi item we can free the efi item.
  */
 void
 xfs_efi_release(xfs_efi_log_item_t     *efip,
                uint                    nextents)
 {
-       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
-       int                     extents_left;
-
-       ASSERT(efip->efi_next_extent > 0);
-       ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
-
-       spin_lock(&ailp->xa_lock);
-       ASSERT(efip->efi_next_extent >= nextents);
-       efip->efi_next_extent -= nextents;
-       extents_left = efip->efi_next_extent;
-       if (extents_left == 0) {
-               /* xfs_trans_ail_delete() drops the AIL lock. */
-               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-               xfs_efi_item_free(efip);
-       } else {
-               spin_unlock(&ailp->xa_lock);
-       }
+       ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
+       if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+               __xfs_efi_release(efip);
 }
 
 static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
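
atomic_sub_and_test() subtracts and returns true only for the caller whose
subtraction takes the counter to zero, so exactly one of any racing EFD
releases performs the final __xfs_efi_release(). A standalone illustration of
the pattern (hypothetical names, not kernel code):

	#include <linux/atomic.h>

	/* hypothetical: drop @nr references, freeing only on the last drop */
	static void drop_refs(atomic_t *remaining, int nr,
			      void (*free_fn)(void *), void *obj)
	{
		/*
		 * atomic_sub_and_test() returns true iff the result is zero,
		 * so only one caller can win the right to free @obj.
		 */
		if (atomic_sub_and_test(nr, remaining))
			free_fn(obj);
	}
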
index 0d22c56..375f68e 100644 (file)
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
 #define        XFS_EFI_MAX_FAST_EXTENTS        16
 
 /*
- * Define EFI flags.
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
  */
-#define        XFS_EFI_RECOVERED       0x1
-#define        XFS_EFI_COMMITTED       0x2
-#define        XFS_EFI_CANCELED        0x4
+#define        XFS_EFI_RECOVERED       1
+#define        XFS_EFI_COMMITTED       2
 
 /*
  * This is the "extent free intention" log item.  It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
  */
 typedef struct xfs_efi_log_item {
        xfs_log_item_t          efi_item;
-       uint                    efi_flags;      /* misc flags */
-       uint                    efi_next_extent;
+       atomic_t                efi_next_extent;
+       unsigned long           efi_flags;      /* misc flags */
        xfs_efi_log_format_t    efi_format;
 } xfs_efi_log_item_t;
 
index a7c116e..f56d30e 100644 (file)
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
                mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
        } else
                mp->m_maxicount = 0;
+       xfs_set_low_space_thresholds(mp);
 
        /* update secondary superblocks. */
        for (agno = 1; agno < nagcount; agno++) {
index d7de5a3..cb9b6d1 100644 (file)
 #include "xfs_trace.h"
 
 
+/*
+ * Define xfs inode iolock lockdep classes. We need to ensure that all active
+ * inodes are considered the same for lockdep purposes, including inodes that
+ * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
+ * guarantee the locks are considered the same when there are multiple lock
+ * initialisation sites. Also, define a reclaimable inode class so it is
+ * obvious in lockdep reports which class the report is against.
+ */
+static struct lock_class_key xfs_iolock_active;
+struct lock_class_key xfs_iolock_reclaimable;
+
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -69,8 +80,11 @@ xfs_inode_alloc(
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
+       ASSERT(ip->i_ino == 0);
 
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+       lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                       &xfs_iolock_active, "xfs_iolock_active");
 
        /* initialise the xfs inode */
        ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
        ip->i_size = 0;
        ip->i_new_size = 0;
 
-       /* prevent anyone from using this yet */
-       VFS_I(ip)->i_state = I_NEW;
-
        return ip;
 }
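
The xfs_iolock_reclaimable key declared above is exported through xfs_inode.h
(see the extern later in this diff) so the eviction path can move a
reclaimable inode's iolock into its own lockdep class. The assumed call site,
which is outside the hunks shown here, would look like:

	/*
	 * Assumed usage at inode eviction: reclassify the iolock so lockdep
	 * reports distinguish inodes in the XFS_IRECLAIMABLE state from
	 * actively referenced ones.
	 */
	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
			&xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
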
 
@@ -145,7 +156,18 @@ xfs_inode_free(
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
 
-       call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback);
+       /*
+        * Because we use RCU freeing we need to ensure the inode always
+        * appears to be reclaimed with an invalid inode number when in the
+        * free state. The ip->i_flags_lock provides the barrier against lookup
+        * races.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
+
+       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 }
 
 /*
@@ -155,14 +177,29 @@ static int
 xfs_iget_cache_hit(
        struct xfs_perag        *pag,
        struct xfs_inode        *ip,
+       xfs_ino_t               ino,
        int                     flags,
-       int                     lock_flags) __releases(pag->pag_ici_lock)
+       int                     lock_flags) __releases(RCU)
 {
        struct inode            *inode = VFS_I(ip);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;
 
+       /*
+        * check for re-use of an inode within an RCU grace period due to the
+        * radix tree nodes not being updated yet. We monitor for this by
+        * setting the inode number to zero before freeing the inode structure.
+        * If the inode has been reallocated and set up, then the inode number
+        * will not match, so check for that, too.
+        */
        spin_lock(&ip->i_flags_lock);
+       if (ip->i_ino != ino) {
+               trace_xfs_iget_skip(ip);
+               XFS_STATS_INC(xs_ig_frecycle);
+               error = EAGAIN;
+               goto out_error;
+       }
+
 
        /*
         * If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
                ip->i_flags |= XFS_IRECLAIM;
 
                spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                error = -inode_init_always(mp->m_super, inode);
                if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
                         * Re-initializing the inode failed, and we are in deep
                         * trouble.  Try to re-add it to the reclaim list.
                         */
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        spin_lock(&ip->i_flags_lock);
 
                        ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
                        goto out_error;
                }
 
-               write_lock(&pag->pag_ici_lock);
+               spin_lock(&pag->pag_ici_lock);
                spin_lock(&ip->i_flags_lock);
                ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
                ip->i_flags |= XFS_INEW;
                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
                inode->i_state = I_NEW;
+
+               ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+               mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+               lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                               &xfs_iolock_active, "xfs_iolock_active");
+
                spin_unlock(&ip->i_flags_lock);
-               write_unlock(&pag->pag_ici_lock);
+               spin_unlock(&pag->pag_ici_lock);
        } else {
                /* If the VFS inode is being torn down, pause and try again. */
                if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
 
                /* We've got a live one. */
                spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                trace_xfs_iget_hit(ip);
        }
 
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
        spin_unlock(&ip->i_flags_lock);
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        return error;
 }
 
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
                        BUG();
        }
 
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
 
        /* insert the new inode */
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
        ip->i_udquot = ip->i_gdquot = NULL;
        xfs_iflags_set(ip, XFS_INEW);
 
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
 
        *ipp = ip;
        return 0;
 
 out_preload_end:
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
        xfs_agino_t     agino;
 
        /* reject inode numbers outside existing AGs */
-       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+       if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return EINVAL;
 
        /* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
 
 again:
        error = 0;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
        if (ip) {
-               error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+               error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
                if (error)
                        goto out_error_or_again;
        } else {
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                XFS_STATS_INC(xs_ig_missed);
 
                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
index 108c7a0..be7cf62 100644 (file)
@@ -887,7 +887,7 @@ xfs_iread(
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
-       XFS_BUF_SET_REF(bp, XFS_INO_REF);
+       xfs_buf_set_ref(bp, XFS_INO_REF);
 
        /*
         * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,16 +2000,32 @@ xfs_ifree_cluster(
                 */
                for (i = 0; i < ninodes; i++) {
 retry:
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
 
-                       /* Inode not in memory or stale, nothing to do */
-                       if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-                               read_unlock(&pag->pag_ici_lock);
+                       /* Inode not in memory, nothing to do */
+                       if (!ip) {
+                               rcu_read_unlock();
                                continue;
                        }
 
+                       /*
+                        * because this is an RCU protected lookup, we could
+                        * find a recently freed or even reallocated inode
+                        * during the lookup. We need to check under the
+                        * i_flags_lock for a valid inode here. Skip it if it
+                        * is not valid, the wrong inode or stale.
+                        */
+                       spin_lock(&ip->i_flags_lock);
+                       if (ip->i_ino != inum + i ||
+                           __xfs_iflags_test(ip, XFS_ISTALE)) {
+                               spin_unlock(&ip->i_flags_lock);
+                               rcu_read_unlock();
+                               continue;
+                       }
+                       spin_unlock(&ip->i_flags_lock);
+
                        /*
                         * Don't try to lock/unlock the current inode, but we
                         * _cannot_ skip the other inodes that we did not find
@@ -2019,11 +2035,11 @@ retry:
                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                               read_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                delay(1);
                                goto retry;
                        }
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
 
                        xfs_iflock(ip);
                        xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
 
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
        /* really need a gang lookup range call here */
        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                        first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
                iq = ilist[i];
                if (iq == ip)
                        continue;
-               /* if the inode lies outside this cluster, we're done. */
-               if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-                       break;
+
+               /*
+                * because this is an RCU protected lookup, we could find a
+                * recently freed or even reallocated inode during the lookup.
+                * We need to check under the i_flags_lock for a valid inode
+                * here. Skip it if it is not valid or the wrong inode.
+                */
+               spin_lock(&iq->i_flags_lock);
+               if (!iq->i_ino ||
+                   (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                       spin_unlock(&iq->i_flags_lock);
+                       continue;
+               }
+               spin_unlock(&iq->i_flags_lock);
+
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
        }
 
 out_free:
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        kmem_free(ilist);
 out_put:
        xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
         * Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the
index fb2ca2e..5c95fa8 100644 (file)
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
  * In-core inode flags.
  */
-#define XFS_IRECLAIM    0x0001  /* we have started reclaiming this inode    */
-#define XFS_ISTALE     0x0002  /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW       0x0008  /* inode has just been allocated */
-#define XFS_IFILESTREAM        0x0010  /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020  /* truncated down so flush-on-close */
+#define XFS_IRECLAIM           0x0001  /* started reclaiming this inode */
+#define XFS_ISTALE             0x0002  /* inode has been staled */
+#define XFS_IRECLAIMABLE       0x0004  /* inode can be reclaimed */
+#define XFS_INEW               0x0008  /* inode has just been allocated */
+#define XFS_IFILESTREAM                0x0010  /* inode is in a filestream directory */
+#define XFS_ITRUNCATED         0x0020  /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE     0x0040  /* dirty release already seen */
 
 /*
  * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_IOLOCK_DEP(flags)  (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
 #define XFS_ILOCK_DEP(flags)   (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
+extern struct lock_class_key xfs_iolock_reclaimable;
+
 /*
  * Flags for xfs_itruncate_start().
  */
index 7c8d30c..fd4f398 100644 (file)
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
  * flushed to disk.  It is responsible for removing the inode item
  * from the AIL if it has not been re-logged, and unlocking the inode's
  * flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
  */
 void
 xfs_iflush_done(
        struct xfs_buf          *bp,
        struct xfs_log_item     *lip)
 {
-       struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-       xfs_inode_t             *ip = iip->ili_inode;
+       struct xfs_inode_log_item *iip;
+       struct xfs_log_item     *blip;
+       struct xfs_log_item     *next;
+       struct xfs_log_item     *prev;
        struct xfs_ail          *ailp = lip->li_ailp;
+       int                     need_ail = 0;
+
+       /*
+        * Scan the buffer IO completions for other inodes being completed and
+        * attach them to the current inode log item.
+        */
+       blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+       prev = NULL;
+       while (blip != NULL) {
+               if (blip->li_cb != xfs_iflush_done) {
+                       prev = blip;
+                       blip = blip->li_bio_list;
+                       continue;
+               }
+
+               /* remove from list */
+               next = blip->li_bio_list;
+               if (!prev) {
+                       XFS_BUF_SET_FSPRIVATE(bp, next);
+               } else {
+                       prev->li_bio_list = next;
+               }
+
+               /* add to current list */
+               blip->li_bio_list = lip->li_bio_list;
+               lip->li_bio_list = blip;
+
+               /*
+                * while we have the item, do the unlocked check for needing
+                * the AIL lock.
+                */
+               iip = INODE_ITEM(blip);
+               if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+                       need_ail++;
+
+               blip = next;
+       }
+
+       /* make sure we capture the state of the initial inode. */
+       iip = INODE_ITEM(lip);
+       if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+               need_ail++;
 
        /*
         * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
         * the lock since it's cheaper, and then we recheck while
         * holding the lock before removing the inode from the AIL.
         */
-       if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
+       if (need_ail) {
+               struct xfs_log_item *log_items[need_ail];
+               int i = 0;
                spin_lock(&ailp->xa_lock);
-               if (lip->li_lsn == iip->ili_flush_lsn) {
-                       /* xfs_trans_ail_delete() drops the AIL lock. */
-                       xfs_trans_ail_delete(ailp, lip);
-               } else {
-                       spin_unlock(&ailp->xa_lock);
+               for (blip = lip; blip; blip = blip->li_bio_list) {
+                       iip = INODE_ITEM(blip);
+                       if (iip->ili_logged &&
+                           blip->li_lsn == iip->ili_flush_lsn) {
+                               log_items[i++] = blip;
+                       }
+                       ASSERT(i <= need_ail);
                }
+               /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+               xfs_trans_ail_delete_bulk(ailp, log_items, i);
        }
 
-       iip->ili_logged = 0;
 
        /*
-        * Clear the ili_last_fields bits now that we know that the
-        * data corresponding to them is safely on disk.
+        * Clean up and unlock the flush lock now that we are done. We can
+        * clear the ili_last_fields bits now that we know that the data
+        * corresponding to them is safely on disk.
         */
-       iip->ili_last_fields = 0;
+       for (blip = lip; blip; blip = next) {
+               next = blip->li_bio_list;
+               blip->li_bio_list = NULL;
 
-       /*
-        * Release the inode's flush lock since we're done with it.
-        */
-       xfs_ifunlock(ip);
+               iip = INODE_ITEM(blip);
+               iip->ili_logged = 0;
+               iip->ili_last_fields = 0;
+               xfs_ifunlock(iip->ili_inode);
+       }
 }
 
 /*
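
The scan above turns one AIL round trip per inode into a single one per
buffer. A minimal userspace sketch of the same splice-and-batch idea, using
hypothetical types and names (struct item, ail_lock, needs_ail) rather than
the XFS ones: unlink every matching item from the shared buffer list, chain it
onto a private list, count how many need the expensive lock, then take that
lock exactly once for the whole batch.

#include <pthread.h>
#include <stdio.h>

struct item {
	struct item	*next;			/* li_bio_list analogue */
	void		(*cb)(struct item *);	/* li_cb analogue */
	int		needs_ail;		/* stands in for the LSN check */
};

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

static void iflush_done(struct item *ip) { (void)ip; }

static void bulk_done(struct item **buf_list, struct item *lip)
{
	struct item *blip = *buf_list, *prev = NULL, *next;
	int need_ail = lip->needs_ail;

	/* Splice every item with our callback onto lip's private list. */
	while (blip) {
		if (blip->cb != iflush_done) {
			prev = blip;
			blip = blip->next;
			continue;
		}
		next = blip->next;
		if (!prev)
			*buf_list = next;
		else
			prev->next = next;
		blip->next = lip->next;
		lip->next = blip;
		need_ail += blip->needs_ail;
		blip = next;
	}

	/* One lock round trip for the whole batch, as in the diff above. */
	if (!need_ail)
		return;
	pthread_mutex_lock(&ail_lock);
	for (blip = lip; blip; blip = blip->next)
		if (blip->needs_ail)
			printf("bulk delete %p\n", (void *)blip);
	pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	struct item a = { NULL, iflush_done, 1 };
	struct item b = { &a, iflush_done, 0 };		/* buffer list: b -> a */
	struct item lip = { NULL, iflush_done, 1 };	/* already off the list */
	struct item *buf_list = &b;

	bulk_done(&buf_list, &lip);	/* two items hit the locked path */
	return 0;
}
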
index 2057614..55582bd 100644 (file)
 
 #define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
                                                << mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS  2
 #define XFS_WRITE_IMAPS                XFS_BMAP_MAX_NMAP
 
-STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-                                 int, struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-                                struct xfs_bmbt_irec *, int *);
-STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-                               struct xfs_bmbt_irec *, int *);
-
-int
-xfs_iomap(
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       ssize_t                 count,
-       int                     flags,
-       struct xfs_bmbt_irec    *imap,
-       int                     *nimaps,
-       int                     *new)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           offset_fsb, end_fsb;
-       int                     error = 0;
-       int                     lockmode = 0;
-       int                     bmapi_flags = 0;
-
-       ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-
-       *new = 0;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
-
-       switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
-       case BMAPI_READ:
-               lockmode = xfs_ilock_map_shared(ip);
-               bmapi_flags = XFS_BMAPI_ENTIRE;
-               break;
-       case BMAPI_WRITE:
-               lockmode = XFS_ILOCK_EXCL;
-               if (flags & BMAPI_IGNSTATE)
-                       bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
-               xfs_ilock(ip, lockmode);
-               break;
-       case BMAPI_ALLOCATE:
-               lockmode = XFS_ILOCK_SHARED;
-               bmapi_flags = XFS_BMAPI_ENTIRE;
-
-               /* Attempt non-blocking lock */
-               if (flags & BMAPI_TRYLOCK) {
-                       if (!xfs_ilock_nowait(ip, lockmode))
-                               return XFS_ERROR(EAGAIN);
-               } else {
-                       xfs_ilock(ip, lockmode);
-               }
-               break;
-       default:
-               BUG();
-       }
-
-       ASSERT(offset <= mp->m_maxioffset);
-       if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
-               count = mp->m_maxioffset - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-       error = xfs_bmapi(NULL, ip, offset_fsb,
-                       (xfs_filblks_t)(end_fsb - offset_fsb),
-                       bmapi_flags,  NULL, 0, imap,
-                       nimaps, NULL);
-
-       if (error)
-               goto out;
-
-       switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
-       case BMAPI_WRITE:
-               /* If we found an extent, return it */
-               if (*nimaps &&
-                   (imap->br_startblock != HOLESTARTBLOCK) &&
-                   (imap->br_startblock != DELAYSTARTBLOCK)) {
-                       trace_xfs_iomap_found(ip, offset, count, flags, imap);
-                       break;
-               }
-
-               if (flags & BMAPI_DIRECT) {
-                       error = xfs_iomap_write_direct(ip, offset, count, flags,
-                                                      imap, nimaps);
-               } else {
-                       error = xfs_iomap_write_delay(ip, offset, count, flags,
-                                                     imap, nimaps);
-               }
-               if (!error) {
-                       trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
-               }
-               *new = 1;
-               break;
-       case BMAPI_ALLOCATE:
-               /* If we found an extent, return it */
-               xfs_iunlock(ip, lockmode);
-               lockmode = 0;
-
-               if (*nimaps && !isnullstartblock(imap->br_startblock)) {
-                       trace_xfs_iomap_found(ip, offset, count, flags, imap);
-                       break;
-               }
-
-               error = xfs_iomap_write_allocate(ip, offset, count,
-                                                imap, nimaps);
-               break;
-       }
-
-       ASSERT(*nimaps <= 1);
-
-out:
-       if (lockmode)
-               xfs_iunlock(ip, lockmode);
-       return XFS_ERROR(error);
-}
-
 STATIC int
 xfs_iomap_eof_align_last_fsb(
        xfs_mount_t     *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
        return EFSCORRUPTED;
 }
 
-STATIC int
+int
 xfs_iomap_write_direct(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       int             flags,
        xfs_bmbt_irec_t *imap,
-       int             *nmaps)
+       int             nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
                if (error)
                        goto error_out;
        } else {
-               if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+               if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
                        last_fsb = MIN(last_fsb, (xfs_fileoff_t)
                                        imap->br_blockcount +
                                        imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
        xfs_trans_ijoin(tp, ip);
 
        bmapi_flag = XFS_BMAPI_WRITE;
-       if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
+       if (offset < ip->i_size || extsz)
                bmapi_flag |= XFS_BMAPI_PREALLOC;
 
        /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
                goto error_out;
        }
 
-       *nmaps = 1;
        return 0;
 
 error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0:     /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 
 error1:        /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       *nmaps = 0;     /* nothing set-up here */
 
 error_out:
        return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
  * If the caller is doing a write at the end of the file, then extend the
  * allocation out to the file system's write iosize.  We clean up any extra
  * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it is not needed.
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       int             ioflag,
        xfs_bmbt_irec_t *imap,
        int             nimaps,
        int             *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
        xfs_filblks_t   count_fsb;
        xfs_fsblock_t   firstblock;
        int             n, error, imaps;
+       int             found_delalloc = 0;
 
        *prealloc = 0;
        if ((offset + count) <= ip->i_size)
@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
                                return 0;
                        start_fsb += imap[n].br_blockcount;
                        count_fsb -= imap[n].br_blockcount;
+
+                       if (imap[n].br_startblock == DELAYSTARTBLOCK)
+                               found_delalloc = 1;
                }
        }
-       *prealloc = 1;
+       if (!found_delalloc)
+               *prealloc = 1;
        return 0;
 }
 
-STATIC int
+/*
+ * If we don't have a user-specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum preallocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *ip)
+{
+       xfs_fsblock_t           alloc_blocks = 0;
+
+       if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+               int shift = 0;
+               int64_t freesp;
+
+               alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+               alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+                                       rounddown_pow_of_two(alloc_blocks));
+
+               xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+               freesp = mp->m_sb.sb_fdblocks;
+               if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+                       shift = 2;
+                       if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+                               shift++;
+                       if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+                               shift++;
+                       if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+                               shift++;
+                       if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+                               shift++;
+               }
+               if (shift)
+                       alloc_blocks >>= shift;
+       }
+
+       if (alloc_blocks < mp->m_writeio_blocks)
+               alloc_blocks = mp->m_writeio_blocks;
+
+       return alloc_blocks;
+}
+
+int
 xfs_iomap_write_delay(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       int             ioflag,
-       xfs_bmbt_irec_t *ret_imap,
-       int             *nmaps)
+       xfs_bmbt_irec_t *ret_imap)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
+
        error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
-                               ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
+                               imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
                return error;
 
 retry:
        if (prealloc) {
+               xfs_fsblock_t   alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-               last_fsb = ioalign + mp->m_writeio_blocks;
+               last_fsb = ioalign + alloc_blocks;
        } else {
                last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
        }
@@ -496,22 +426,31 @@ retry:
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
                          &nimaps, NULL);
-       if (error && (error != ENOSPC))
+       switch (error) {
+       case 0:
+       case ENOSPC:
+       case EDQUOT:
+               break;
+       default:
                return XFS_ERROR(error);
+       }
 
        /*
-        * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-        * then we must have run out of space - flush all other inodes with
-        * delalloc blocks and retry without EOF preallocation.
+        * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
+        * ENOSPC, flush all other inodes with delalloc blocks to free up
+        * some of the excess reserved metadata space. For both cases, retry
+        * without EOF preallocation.
         */
        if (nimaps == 0) {
                trace_xfs_delalloc_enospc(ip, offset, count);
                if (flushed)
-                       return XFS_ERROR(ENOSPC);
+                       return XFS_ERROR(error ? error : ENOSPC);
 
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_flush_inodes(ip);
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               if (error == ENOSPC) {
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                       xfs_flush_inodes(ip);
+                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+               }
 
                flushed = 1;
                error = 0;
@@ -523,8 +462,6 @@ retry:
                return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
 
        *ret_imap = imap[0];
-       *nmaps = 1;
-
        return 0;
 }
 
@@ -538,13 +475,12 @@ retry:
  * We no longer bother to look at the incoming map - all we have to
  * guarantee is that whatever we allocate fills the required range.
  */
-STATIC int
+int
 xfs_iomap_write_allocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-       xfs_bmbt_irec_t *imap,
-       int             *retmap)
+       xfs_bmbt_irec_t *imap)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb, last_block;
@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
        int             error = 0;
        int             nres;
 
-       *retmap = 0;
-
        /*
         * Make sure that the dquots are there.
         */
@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
                if ((offset_fsb >= imap->br_startoff) &&
                    (offset_fsb < (imap->br_startoff +
                                   imap->br_blockcount))) {
-                       *retmap = 1;
                        XFS_STATS_INC(xs_xstrat_quick);
                        return 0;
                }
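
Together, the found_delalloc check and xfs_iomap_prealloc_size() throttle
speculative preallocation: skip it when delalloc blocks already extend past
EOF, and otherwise scale it with the file size, halving it for each low-space
threshold crossed. A standalone sketch of the sizing arithmetic, with made-up
threshold values standing in for mp->m_low_space[]:

#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN	((1 << 21) - 1)	/* max extent length, in fs blocks */

static uint64_t rounddown_pow_of_two(uint64_t n)
{
	while (n & (n - 1))
		n &= n - 1;	/* clear low bits until a power of two remains */
	return n;
}

/* lowsp[i] is the free-block count at (i+1)% of the filesystem, ascending. */
static uint64_t prealloc_size(uint64_t isize_blocks, uint64_t freesp,
			      const uint64_t lowsp[5], uint64_t writeio_blocks)
{
	uint64_t alloc_blocks = rounddown_pow_of_two(isize_blocks);
	int shift = 0, i;

	if (alloc_blocks > MAXEXTLEN)
		alloc_blocks = MAXEXTLEN;

	/* Below 5% free: shift by 2, plus one for each lower threshold. */
	if (freesp < lowsp[4]) {
		shift = 2;
		for (i = 3; i >= 0; i--)
			if (freesp < lowsp[i])
				shift++;
	}
	alloc_blocks >>= shift;

	return alloc_blocks < writeio_blocks ? writeio_blocks : alloc_blocks;
}

int main(void)
{
	const uint64_t lowsp[5] = { 1000, 2000, 3000, 4000, 5000 };

	/* 1GiB file (4k blocks), plenty of free space: full-sized prealloc. */
	printf("%llu\n", (unsigned long long)
	       prealloc_size(262144, 100000, lowsp, 16));	/* 262144 */
	/* Same file, below the 2% threshold: 262144 >> 5 = 8192 blocks. */
	printf("%llu\n", (unsigned long long)
	       prealloc_size(262144, 1500, lowsp, 16));		/* 8192 */
	return 0;
}
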
index 7748a43..8061576 100644 (file)
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-/* base extent manipulation calls */
-#define BMAPI_READ     (1 << 0)        /* read extents */
-#define BMAPI_WRITE    (1 << 1)        /* create extents */
-#define BMAPI_ALLOCATE (1 << 2)        /* delayed allocate to real extents */
-
-/* modifiers */
-#define BMAPI_IGNSTATE (1 << 4)        /* ignore unwritten state on read */
-#define BMAPI_DIRECT   (1 << 5)        /* direct instead of buffered write */
-#define BMAPI_MMA      (1 << 6)        /* allocate for mmap write */
-#define BMAPI_TRYLOCK  (1 << 7)        /* non-blocking request */
-
-#define BMAPI_FLAGS \
-       { BMAPI_READ,           "READ" }, \
-       { BMAPI_WRITE,          "WRITE" }, \
-       { BMAPI_ALLOCATE,       "ALLOCATE" }, \
-       { BMAPI_IGNSTATE,       "IGNSTATE" }, \
-       { BMAPI_DIRECT,         "DIRECT" }, \
-       { BMAPI_TRYLOCK,        "TRYLOCK" }
-
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
-extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
-                    struct xfs_bmbt_irec *, int *, int *);
+extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+                       struct xfs_bmbt_irec *, int);
+extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+                       struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+                       struct xfs_bmbt_irec *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 
 #endif /* __XFS_IOMAP_H__*/
index cee4ab9..0bf24b1 100644 (file)
@@ -47,7 +47,7 @@ STATIC xlog_t *  xlog_alloc_log(xfs_mount_t   *mp,
                                xfs_buftarg_t   *log_target,
                                xfs_daddr_t     blk_offset,
                                int             num_bblks);
-STATIC int      xlog_space_left(xlog_t *log, int cycle, int bytes);
+STATIC int      xlog_space_left(struct log *log, atomic64_t *head);
 STATIC int      xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void     xlog_dealloc_log(xlog_t *log);
 
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t       *log, xlog_in_core_t *iclog);
 /* local functions to manipulate grant head */
 STATIC int  xlog_grant_log_space(xlog_t                *log,
                                 xlog_ticket_t  *xtic);
-STATIC void xlog_grant_push_ail(xfs_mount_t    *mp,
+STATIC void xlog_grant_push_ail(struct log     *log,
                                int             need_bytes);
 STATIC void xlog_regrant_reserve_log_space(xlog_t       *log,
                                           xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t    *log,
 
 #if defined(DEBUG)
 STATIC void    xlog_verify_dest_ptr(xlog_t *log, char *ptr);
-STATIC void    xlog_verify_grant_head(xlog_t *log, int equals);
+STATIC void    xlog_verify_grant_tail(struct log *log);
 STATIC void    xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
                                  int count, boolean_t syncing);
 STATIC void    xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
                                     xfs_lsn_t tail_lsn);
 #else
 #define xlog_verify_dest_ptr(a,b)
-#define xlog_verify_grant_head(a,b)
+#define xlog_verify_grant_tail(a)
 #define xlog_verify_iclog(a,b,c,d)
 #define xlog_verify_tail_lsn(a,b,c)
 #endif
 
 STATIC int     xlog_iclogs_empty(xlog_t *log);
 
-
 static void
-xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
+xlog_grant_sub_space(
+       struct log      *log,
+       atomic64_t      *head,
+       int             bytes)
 {
-       if (*qp) {
-               tic->t_next         = (*qp);
-               tic->t_prev         = (*qp)->t_prev;
-               (*qp)->t_prev->t_next = tic;
-               (*qp)->t_prev       = tic;
-       } else {
-               tic->t_prev = tic->t_next = tic;
-               *qp = tic;
-       }
+       int64_t head_val = atomic64_read(head);
+       int64_t new, old;
 
-       tic->t_flags |= XLOG_TIC_IN_Q;
-}
+       do {
+               int     cycle, space;
 
-static void
-xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
-{
-       if (tic == tic->t_next) {
-               *qp = NULL;
-       } else {
-               *qp = tic->t_next;
-               tic->t_next->t_prev = tic->t_prev;
-               tic->t_prev->t_next = tic->t_next;
-       }
+               xlog_crack_grant_head_val(head_val, &cycle, &space);
 
-       tic->t_next = tic->t_prev = NULL;
-       tic->t_flags &= ~XLOG_TIC_IN_Q;
+               space -= bytes;
+               if (space < 0) {
+                       space += log->l_logsize;
+                       cycle--;
+               }
+
+               old = head_val;
+               new = xlog_assign_grant_head_val(cycle, space);
+               head_val = atomic64_cmpxchg(head, old, new);
+       } while (head_val != old);
 }
 
 static void
-xlog_grant_sub_space(struct log *log, int bytes)
+xlog_grant_add_space(
+       struct log      *log,
+       atomic64_t      *head,
+       int             bytes)
 {
-       log->l_grant_write_bytes -= bytes;
-       if (log->l_grant_write_bytes < 0) {
-               log->l_grant_write_bytes += log->l_logsize;
-               log->l_grant_write_cycle--;
-       }
-
-       log->l_grant_reserve_bytes -= bytes;
-       if ((log)->l_grant_reserve_bytes < 0) {
-               log->l_grant_reserve_bytes += log->l_logsize;
-               log->l_grant_reserve_cycle--;
-       }
+       int64_t head_val = atomic64_read(head);
+       int64_t new, old;
 
-}
+       do {
+               int             tmp;
+               int             cycle, space;
 
-static void
-xlog_grant_add_space_write(struct log *log, int bytes)
-{
-       int tmp = log->l_logsize - log->l_grant_write_bytes;
-       if (tmp > bytes)
-               log->l_grant_write_bytes += bytes;
-       else {
-               log->l_grant_write_cycle++;
-               log->l_grant_write_bytes = bytes - tmp;
-       }
-}
+               xlog_crack_grant_head_val(head_val, &cycle, &space);
 
-static void
-xlog_grant_add_space_reserve(struct log *log, int bytes)
-{
-       int tmp = log->l_logsize - log->l_grant_reserve_bytes;
-       if (tmp > bytes)
-               log->l_grant_reserve_bytes += bytes;
-       else {
-               log->l_grant_reserve_cycle++;
-               log->l_grant_reserve_bytes = bytes - tmp;
-       }
-}
+               tmp = log->l_logsize - space;
+               if (tmp > bytes)
+                       space += bytes;
+               else {
+                       space = bytes - tmp;
+                       cycle++;
+               }
 
-static inline void
-xlog_grant_add_space(struct log *log, int bytes)
-{
-       xlog_grant_add_space_write(log, bytes);
-       xlog_grant_add_space_reserve(log, bytes);
+               old = head_val;
+               new = xlog_assign_grant_head_val(cycle, space);
+               head_val = atomic64_cmpxchg(head, old, new);
+       } while (head_val != old);
 }
 
 static void
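
The pair of helpers above is the core of the lockless grant accounting: cycle
and byte offset are packed into one 64-bit value so both halves always move
together. A compilable C11 sketch of the add case, assuming the natural
cycle:32|bytes:32 packing and using compare_exchange_weak in place of the
kernel's atomic64_cmpxchg:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define LOGSIZE		(16 * 1024 * 1024)	/* illustrative log size, bytes */

/* Assumed packing: cycle in the high 32 bits, byte offset in the low 32. */
static int64_t assign_grant_head_val(int cycle, int bytes)
{
	return ((int64_t)cycle << 32) | bytes;
}

static void crack_grant_head_val(int64_t val, int *cycle, int *bytes)
{
	*cycle = val >> 32;
	*bytes = val & 0xffffffff;
}

static void grant_add_space(_Atomic int64_t *head, int bytes)
{
	int64_t head_val = atomic_load(head);
	int64_t new;

	do {
		int cycle, space, tmp;

		crack_grant_head_val(head_val, &cycle, &space);
		tmp = LOGSIZE - space;
		if (tmp > bytes)
			space += bytes;
		else {
			space = bytes - tmp;	/* wrapped past the end */
			cycle++;
		}
		new = assign_grant_head_val(cycle, space);
		/* on failure, head_val is refreshed and we recompute */
	} while (!atomic_compare_exchange_weak(head, &head_val, new));
}

int main(void)
{
	_Atomic int64_t head = assign_grant_head_val(1, 0);
	int cycle, bytes;

	grant_add_space(&head, LOGSIZE - 100);	/* nearly fill cycle 1 */
	grant_add_space(&head, 300);		/* forces a cycle wrap */
	crack_grant_head_val(atomic_load(&head), &cycle, &bytes);
	printf("cycle %d, bytes %d\n", cycle, bytes);	/* cycle 2, bytes 200 */
	return 0;
}
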
@@ -355,7 +330,7 @@ xfs_log_reserve(
 
                trace_xfs_log_reserve(log, internal_ticket);
 
-               xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
+               xlog_grant_push_ail(log, internal_ticket->t_unit_res);
                retval = xlog_regrant_write_log_space(log, internal_ticket);
        } else {
                /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
 
                trace_xfs_log_reserve(log, internal_ticket);
 
-               xlog_grant_push_ail(mp,
+               xlog_grant_push_ail(log,
                                    (internal_ticket->t_unit_res *
                                     internal_ticket->t_cnt));
                retval = xlog_grant_log_space(log, internal_ticket);
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
                      iclog->ic_state == XLOG_STATE_DIRTY)) {
                        if (!XLOG_FORCED_SHUTDOWN(log)) {
-                               sv_wait(&iclog->ic_force_wait, PMEM,
-                                       &log->l_icloglock, s);
+                               xlog_wait(&iclog->ic_force_wait,
+                                                       &log->l_icloglock);
                        } else {
                                spin_unlock(&log->l_icloglock);
                        }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                        || iclog->ic_state == XLOG_STATE_DIRTY
                        || iclog->ic_state == XLOG_STATE_IOERROR) ) {
 
-                               sv_wait(&iclog->ic_force_wait, PMEM,
-                                       &log->l_icloglock, s);
+                               xlog_wait(&iclog->ic_force_wait,
+                                                       &log->l_icloglock);
                } else {
                        spin_unlock(&log->l_icloglock);
                }
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t     *mp,
 {
        xlog_ticket_t   *tic;
        xlog_t          *log = mp->m_log;
-       int             need_bytes, free_bytes, cycle, bytes;
+       int             need_bytes, free_bytes;
 
        if (XLOG_FORCED_SHUTDOWN(log))
                return;
 
-       if (tail_lsn == 0) {
-               /* needed since sync_lsn is 64 bits */
-               spin_lock(&log->l_icloglock);
-               tail_lsn = log->l_last_sync_lsn;
-               spin_unlock(&log->l_icloglock);
-       }
-
-       spin_lock(&log->l_grant_lock);
+       if (tail_lsn == 0)
+               tail_lsn = atomic64_read(&log->l_last_sync_lsn);
 
-       /* Also an invalid lsn.  1 implies that we aren't passing in a valid
-        * tail_lsn.
-        */
-       if (tail_lsn != 1) {
-               log->l_tail_lsn = tail_lsn;
-       }
+       /* tail_lsn == 1 implies that we weren't passed a valid value.  */
+       if (tail_lsn != 1)
+               atomic64_set(&log->l_tail_lsn, tail_lsn);
 
-       if ((tic = log->l_write_headq)) {
+       if (!list_empty_careful(&log->l_writeq)) {
 #ifdef DEBUG
                if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                        panic("Recovery problem");
 #endif
-               cycle = log->l_grant_write_cycle;
-               bytes = log->l_grant_write_bytes;
-               free_bytes = xlog_space_left(log, cycle, bytes);
-               do {
+               spin_lock(&log->l_grant_write_lock);
+               free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+               list_for_each_entry(tic, &log->l_writeq, t_queue) {
                        ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
 
                        if (free_bytes < tic->t_unit_res && tail_lsn != 1)
                                break;
                        tail_lsn = 0;
                        free_bytes -= tic->t_unit_res;
-                       sv_signal(&tic->t_wait);
-                       tic = tic->t_next;
-               } while (tic != log->l_write_headq);
+                       trace_xfs_log_regrant_write_wake_up(log, tic);
+                       wake_up(&tic->t_wait);
+               }
+               spin_unlock(&log->l_grant_write_lock);
        }
-       if ((tic = log->l_reserve_headq)) {
+
+       if (!list_empty_careful(&log->l_reserveq)) {
 #ifdef DEBUG
                if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                        panic("Recovery problem");
 #endif
-               cycle = log->l_grant_reserve_cycle;
-               bytes = log->l_grant_reserve_bytes;
-               free_bytes = xlog_space_left(log, cycle, bytes);
-               do {
+               spin_lock(&log->l_grant_reserve_lock);
+               free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+               list_for_each_entry(tic, &log->l_reserveq, t_queue) {
                        if (tic->t_flags & XLOG_TIC_PERM_RESERV)
                                need_bytes = tic->t_unit_res*tic->t_cnt;
                        else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t     *mp,
                                break;
                        tail_lsn = 0;
                        free_bytes -= need_bytes;
-                       sv_signal(&tic->t_wait);
-                       tic = tic->t_next;
-               } while (tic != log->l_reserve_headq);
+                       trace_xfs_log_grant_wake_up(log, tic);
+                       wake_up(&tic->t_wait);
+               }
+               spin_unlock(&log->l_grant_reserve_lock);
        }
-       spin_unlock(&log->l_grant_lock);
-}      /* xfs_log_move_tail */
+}
 
 /*
  * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
  * We may be holding the log iclog lock upon entering this routine.
  */
 xfs_lsn_t
-xlog_assign_tail_lsn(xfs_mount_t *mp)
+xlog_assign_tail_lsn(
+       struct xfs_mount        *mp)
 {
-       xfs_lsn_t tail_lsn;
-       xlog_t    *log = mp->m_log;
+       xfs_lsn_t               tail_lsn;
+       struct log              *log = mp->m_log;
 
        tail_lsn = xfs_trans_ail_tail(mp->m_ail);
-       spin_lock(&log->l_grant_lock);
-       if (tail_lsn != 0) {
-               log->l_tail_lsn = tail_lsn;
-       } else {
-               tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
-       }
-       spin_unlock(&log->l_grant_lock);
+       if (!tail_lsn)
+               tail_lsn = atomic64_read(&log->l_last_sync_lsn);
 
+       atomic64_set(&log->l_tail_lsn, tail_lsn);
        return tail_lsn;
-}      /* xlog_assign_tail_lsn */
-
+}
 
 /*
  * Return the space in the log between the tail and the head.  The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
  * result is that we return the size of the log as the amount of space left.
  */
 STATIC int
-xlog_space_left(xlog_t *log, int cycle, int bytes)
-{
-       int free_bytes;
-       int tail_bytes;
-       int tail_cycle;
-
-       tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
-       tail_cycle = CYCLE_LSN(log->l_tail_lsn);
-       if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
-               free_bytes = log->l_logsize - (bytes - tail_bytes);
-       } else if ((tail_cycle + 1) < cycle) {
+xlog_space_left(
+       struct log      *log,
+       atomic64_t      *head)
+{
+       int             free_bytes;
+       int             tail_bytes;
+       int             tail_cycle;
+       int             head_cycle;
+       int             head_bytes;
+
+       xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+       xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+       tail_bytes = BBTOB(tail_bytes);
+       if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+               free_bytes = log->l_logsize - (head_bytes - tail_bytes);
+       else if (tail_cycle + 1 < head_cycle)
                return 0;
-       } else if (tail_cycle < cycle) {
-               ASSERT(tail_cycle == (cycle - 1));
-               free_bytes = tail_bytes - bytes;
+       else if (tail_cycle < head_cycle) {
+               ASSERT(tail_cycle == (head_cycle - 1));
+               free_bytes = tail_bytes - head_bytes;
        } else {
                /*
                 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
                        "xlog_space_left: head behind tail\n"
                        "  tail_cycle = %d, tail_bytes = %d\n"
                        "  GH   cycle = %d, GH   bytes = %d",
-                       tail_cycle, tail_bytes, cycle, bytes);
+                       tail_cycle, tail_bytes, head_cycle, head_bytes);
                ASSERT(0);
                free_bytes = log->l_logsize;
        }
        return free_bytes;
-}      /* xlog_space_left */
+}
 
 
 /*
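
The rewritten xlog_space_left() reads both the head and the tail as cracked
(cycle, bytes) pairs, but the arithmetic is unchanged. A standalone rendering
of the three healthy cases, with an illustrative log size:

#include <stdio.h>

#define LOGSIZE	(16 * 1024 * 1024)	/* illustrative log size, bytes */

/* Free bytes between the tail and a grant head; cycles count full wraps. */
static int space_left(int tail_cycle, int tail_bytes,
		      int head_cycle, int head_bytes)
{
	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
		return LOGSIZE - (head_bytes - tail_bytes);
	else if (tail_cycle + 1 < head_cycle)
		return 0;			/* over-committed: nothing left */
	else if (tail_cycle < head_cycle)
		return tail_bytes - head_bytes;	/* head has wrapped once */
	return LOGSIZE;	/* head behind tail: corrupt; the kernel asserts here */
}

int main(void)
{
	/* Same cycle: the head simply leads the tail. */
	printf("%d\n", space_left(5, 1000, 5, 4000));	/* LOGSIZE - 3000 */
	/* Head one cycle ahead: only the gap up to the tail remains. */
	printf("%d\n", space_left(5, 4000, 6, 1000));	/* 3000 */
	return 0;
}
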
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t      *mp,
        log->l_flags       |= XLOG_ACTIVE_RECOVERY;
 
        log->l_prev_block  = -1;
-       log->l_tail_lsn    = xlog_assign_lsn(1, 0);
        /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
-       log->l_last_sync_lsn = log->l_tail_lsn;
+       xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+       xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
        log->l_curr_cycle  = 1;     /* 0 is bad since this is initial value */
-       log->l_grant_reserve_cycle = 1;
-       log->l_grant_write_cycle = 1;
+       xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
+       xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
+       INIT_LIST_HEAD(&log->l_reserveq);
+       INIT_LIST_HEAD(&log->l_writeq);
+       spin_lock_init(&log->l_grant_reserve_lock);
+       spin_lock_init(&log->l_grant_write_lock);
 
        error = EFSCORRUPTED;
        if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t        *mp,
        log->l_xbuf = bp;
 
        spin_lock_init(&log->l_icloglock);
-       spin_lock_init(&log->l_grant_lock);
-       sv_init(&log->l_flush_wait, 0, "flush_wait");
+       init_waitqueue_head(&log->l_flush_wait);
 
        /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
        ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t        *mp,
 
                ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
                ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
-               sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
-               sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+               init_waitqueue_head(&iclog->ic_force_wait);
+               init_waitqueue_head(&iclog->ic_write_wait);
 
                iclogp = &iclog->ic_next;
        }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t      *mp,
 out_free_iclog:
        for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
                prev_iclog = iclog->ic_next;
-               if (iclog->ic_bp) {
-                       sv_destroy(&iclog->ic_force_wait);
-                       sv_destroy(&iclog->ic_write_wait);
+               if (iclog->ic_bp)
                        xfs_buf_free(iclog->ic_bp);
-               }
                kmem_free(iclog);
        }
        spinlock_destroy(&log->l_icloglock);
-       spinlock_destroy(&log->l_grant_lock);
        xfs_buf_free(log->l_xbuf);
 out_free_log:
        kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
  * water mark.  In this manner, we would be creating a low water mark.
  */
 STATIC void
-xlog_grant_push_ail(xfs_mount_t        *mp,
-                   int         need_bytes)
+xlog_grant_push_ail(
+       struct log      *log,
+       int             need_bytes)
 {
-    xlog_t     *log = mp->m_log;       /* pointer to the log */
-    xfs_lsn_t  tail_lsn;               /* lsn of the log tail */
-    xfs_lsn_t  threshold_lsn = 0;      /* lsn we'd like to be at */
-    int                free_blocks;            /* free blocks left to write to */
-    int                free_bytes;             /* free bytes left to write to */
-    int                threshold_block;        /* block in lsn we'd like to be at */
-    int                threshold_cycle;        /* lsn cycle we'd like to be at */
-    int                free_threshold;
-
-    ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
-
-    spin_lock(&log->l_grant_lock);
-    free_bytes = xlog_space_left(log,
-                                log->l_grant_reserve_cycle,
-                                log->l_grant_reserve_bytes);
-    tail_lsn = log->l_tail_lsn;
-    free_blocks = BTOBBT(free_bytes);
-
-    /*
-     * Set the threshold for the minimum number of free blocks in the
-     * log to the maximum of what the caller needs, one quarter of the
-     * log, and 256 blocks.
-     */
-    free_threshold = BTOBB(need_bytes);
-    free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
-    free_threshold = MAX(free_threshold, 256);
-    if (free_blocks < free_threshold) {
-       threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
-       threshold_cycle = CYCLE_LSN(tail_lsn);
+       xfs_lsn_t       threshold_lsn = 0;
+       xfs_lsn_t       last_sync_lsn;
+       int             free_blocks;
+       int             free_bytes;
+       int             threshold_block;
+       int             threshold_cycle;
+       int             free_threshold;
+
+       ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+       free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+       free_blocks = BTOBBT(free_bytes);
+
+       /*
+        * Set the threshold for the minimum number of free blocks in the
+        * log to the maximum of what the caller needs, one quarter of the
+        * log, and 256 blocks.
+        */
+       free_threshold = BTOBB(need_bytes);
+       free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+       free_threshold = MAX(free_threshold, 256);
+       if (free_blocks >= free_threshold)
+               return;
+
+       xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
+                                               &threshold_block);
+       threshold_block += free_threshold;
        if (threshold_block >= log->l_logBBsize) {
-           threshold_block -= log->l_logBBsize;
-           threshold_cycle += 1;
+               threshold_block -= log->l_logBBsize;
+               threshold_cycle += 1;
        }
-       threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block);
+       threshold_lsn = xlog_assign_lsn(threshold_cycle,
+                                       threshold_block);
+       /*
+        * Don't pass in an lsn greater than the lsn of the last
+        * log record known to be on disk. Use a snapshot of the last sync lsn
+        * so that it doesn't change between the compare and the set.
+        */
+       last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+       if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+               threshold_lsn = last_sync_lsn;
 
-       /* Don't pass in an lsn greater than the lsn of the last
-        * log record known to be on disk.
+       /*
+        * Get the transaction layer to kick the dirty buffers out to
+        * disk asynchronously. No point in trying to do this if
+        * the filesystem is shutting down.
         */
-       if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
-           threshold_lsn = log->l_last_sync_lsn;
-    }
-    spin_unlock(&log->l_grant_lock);
-
-    /*
-     * Get the transaction layer to kick the dirty buffers out to
-     * disk asynchronously. No point in trying to do this if
-     * the filesystem is shutting down.
-     */
-    if (threshold_lsn &&
-       !XLOG_FORCED_SHUTDOWN(log))
-           xfs_trans_ail_push(log->l_ailp, threshold_lsn);
-}      /* xlog_grant_push_ail */
+       if (!XLOG_FORCED_SHUTDOWN(log))
+               xfs_trans_ail_push(log->l_ailp, threshold_lsn);
+}
 
 /*
  * The bdstrat callback function for log bufs. This gives us a central
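
In short, the push target above is tail + max(need_bytes, a quarter of the
log, 256) in basic blocks, wrapped into the next cycle when it runs off the
end of the log; the final clamp to l_last_sync_lsn is elided here. A sketch
with concrete numbers, assuming an 8192-block log:

#include <stdio.h>

#define LOG_BBSIZE	8192		/* illustrative log size, 512B blocks */

/* Compute the AIL push threshold as an absolute (cycle, block) target. */
static void push_target(int tail_cycle, int tail_block, int need_blocks,
			int *thr_cycle, int *thr_block)
{
	int free_threshold = need_blocks;

	if (free_threshold < LOG_BBSIZE >> 2)
		free_threshold = LOG_BBSIZE >> 2;
	if (free_threshold < 256)
		free_threshold = 256;

	*thr_cycle = tail_cycle;
	*thr_block = tail_block + free_threshold;
	if (*thr_block >= LOG_BBSIZE) {		/* wrap into the next cycle */
		*thr_block -= LOG_BBSIZE;
		*thr_cycle += 1;
	}
}

int main(void)
{
	int c, b;

	/* threshold = max(100, 8192/4, 256) = 2048; 7000 + 2048 wraps. */
	push_target(7, 7000, 100, &c, &b);
	printf("cycle %d block %d\n", c, b);	/* cycle 8 block 856 */
	return 0;
}
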
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t          *log,
                 roundoff < BBTOB(1)));
 
        /* move grant heads by roundoff in sync */
-       spin_lock(&log->l_grant_lock);
-       xlog_grant_add_space(log, roundoff);
-       spin_unlock(&log->l_grant_lock);
+       xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
+       xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
 
        /* put cycle number in every block */
        xlog_pack_data(log, iclog, roundoff); 
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
 
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
-               sv_destroy(&iclog->ic_force_wait);
-               sv_destroy(&iclog->ic_write_wait);
                xfs_buf_free(iclog->ic_bp);
                next_iclog = iclog->ic_next;
                kmem_free(iclog);
                iclog = next_iclog;
        }
        spinlock_destroy(&log->l_icloglock);
-       spinlock_destroy(&log->l_grant_lock);
 
        xfs_buf_free(log->l_xbuf);
        log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
                                lowest_lsn = xlog_get_lowest_lsn(log);
                                if (lowest_lsn &&
                                    XFS_LSN_CMP(lowest_lsn,
-                                               be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
+                                               be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
                                        iclog = iclog->ic_next;
                                        continue; /* Leave this iclog for
                                                   * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
 
                                iclog->ic_state = XLOG_STATE_CALLBACK;
 
-                               spin_unlock(&log->l_icloglock);
 
-                               /* l_last_sync_lsn field protected by
-                                * l_grant_lock. Don't worry about iclog's lsn.
-                                * No one else can be here except us.
+                               /*
+                                * update the last_sync_lsn before we drop the
+                                * icloglock to ensure we are the only one that
+                                * can update it.
                                 */
-                               spin_lock(&log->l_grant_lock);
-                               ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn,
-                                      be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-                               log->l_last_sync_lsn =
-                                       be64_to_cpu(iclog->ic_header.h_lsn);
-                               spin_unlock(&log->l_grant_lock);
+                               ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+                                       be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
+                               atomic64_set(&log->l_last_sync_lsn,
+                                       be64_to_cpu(iclog->ic_header.h_lsn));
 
-                       } else {
-                               spin_unlock(&log->l_icloglock);
+                       } else
                                ioerrors++;
-                       }
+
+                       spin_unlock(&log->l_icloglock);
 
                        /*
                         * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
                        xlog_state_clean_log(log);
 
                        /* wake up threads waiting in xfs_log_force() */
-                       sv_broadcast(&iclog->ic_force_wait);
+                       wake_up_all(&iclog->ic_force_wait);
 
                        iclog = iclog->ic_next;
                } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
        spin_unlock(&log->l_icloglock);
 
        if (wake)
-               sv_broadcast(&log->l_flush_wait);
+               wake_up_all(&log->l_flush_wait);
 }
 
 
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
         * iclog buffer, we wake them all, one will get to do the
         * I/O, the others get to wait for the result.
         */
-       sv_broadcast(&iclog->ic_write_wait);
+       wake_up_all(&iclog->ic_write_wait);
        spin_unlock(&log->l_icloglock);
        xlog_state_do_callback(log, aborted, iclog);    /* also cleans log */
 }      /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
                XFS_STATS_INC(xs_log_noiclogs);
 
                /* Wait for log writes to have flushed */
-               sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
+               xlog_wait(&log->l_flush_wait, &log->l_icloglock);
                goto restart;
        }
 
@@ -2527,6 +2486,18 @@ restart:
  *
  * Once a ticket gets put onto the reserveq, it will only return after
  * the needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock-free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off the reserveq under the
+ * l_grant_reserve_lock, we only need to take that lock if we are going to add
+ * the ticket to the queue and sleep. We can avoid taking the lock if the
+ * ticket was never added to the reserveq, because then its t_queue list head
+ * is empty and we hold the only reference to it, so it can safely be checked
+ * unlocked.
  */
 STATIC int
 xlog_grant_log_space(xlog_t       *log,
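
The comment describes a classic optimistic scheme: peek at the queue unlocked,
and only take the lock, then recheck, when it looks non-empty. A hedged
userspace analogue, where an unlocked read of a plain counter stands in for
list_empty_careful():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
static int queue_len;			/* stands in for the ticket list */

/* Returns 1 if we took the slow path and queued behind other waiters. */
static int reserve_space(void)
{
	/* Fast path: unlocked peek; cheap because the queue is usually empty. */
	if (queue_len == 0)
		return 0;

	pthread_mutex_lock(&grant_lock);
	/* Recheck under the lock: the queue may have drained meanwhile. */
	if (queue_len == 0) {
		pthread_mutex_unlock(&grant_lock);
		return 0;
	}
	queue_len++;			/* join the queue and (would) sleep */
	pthread_mutex_unlock(&grant_lock);
	return 1;
}

int main(void)
{
	printf("%d\n", reserve_space());	/* 0: fast path, no lock taken */
	return 0;
}
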
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t        *log,
 {
        int              free_bytes;
        int              need_bytes;
-#ifdef DEBUG
-       xfs_lsn_t        tail_lsn;
-#endif
-
 
 #ifdef DEBUG
        if (log->l_flags & XLOG_ACTIVE_RECOVERY)
                panic("grant Recovery problem");
 #endif
 
-       /* Is there space or do we need to sleep? */
-       spin_lock(&log->l_grant_lock);
-
        trace_xfs_log_grant_enter(log, tic);
 
+       need_bytes = tic->t_unit_res;
+       if (tic->t_flags & XFS_LOG_PERM_RESERV)
+               need_bytes *= tic->t_ocnt;
+
        /* something is already sleeping; insert new transaction at end */
-       if (log->l_reserve_headq) {
-               xlog_ins_ticketq(&log->l_reserve_headq, tic);
+       if (!list_empty_careful(&log->l_reserveq)) {
+               spin_lock(&log->l_grant_reserve_lock);
+               /* recheck the queue now we are locked */
+               if (list_empty(&log->l_reserveq)) {
+                       spin_unlock(&log->l_grant_reserve_lock);
+                       goto redo;
+               }
+               list_add_tail(&tic->t_queue, &log->l_reserveq);
 
                trace_xfs_log_grant_sleep1(log, tic);
 
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t        *log,
                        goto error_return;
 
                XFS_STATS_INC(xs_sleep_logspace);
-               sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
+               xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+
                /*
                 * If we got an error, and the filesystem is shutting down,
                 * we'll catch it down below. So just continue...
                 */
                trace_xfs_log_grant_wake1(log, tic);
-               spin_lock(&log->l_grant_lock);
        }
-       if (tic->t_flags & XFS_LOG_PERM_RESERV)
-               need_bytes = tic->t_unit_res*tic->t_ocnt;
-       else
-               need_bytes = tic->t_unit_res;
 
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
-               goto error_return;
+               goto error_return_unlocked;
 
-       free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
-                                    log->l_grant_reserve_bytes);
+       free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
        if (free_bytes < need_bytes) {
-               if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
-                       xlog_ins_ticketq(&log->l_reserve_headq, tic);
+               spin_lock(&log->l_grant_reserve_lock);
+               if (list_empty(&tic->t_queue))
+                       list_add_tail(&tic->t_queue, &log->l_reserveq);
 
                trace_xfs_log_grant_sleep2(log, tic);
 
-               spin_unlock(&log->l_grant_lock);
-               xlog_grant_push_ail(log->l_mp, need_bytes);
-               spin_lock(&log->l_grant_lock);
-
-               XFS_STATS_INC(xs_sleep_logspace);
-               sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
-
-               spin_lock(&log->l_grant_lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
 
-               trace_xfs_log_grant_wake2(log, tic);
+               xlog_grant_push_ail(log, need_bytes);
+
+               XFS_STATS_INC(xs_sleep_logspace);
+               xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
 
+               trace_xfs_log_grant_wake2(log, tic);
                goto redo;
-       } else if (tic->t_flags & XLOG_TIC_IN_Q)
-               xlog_del_ticketq(&log->l_reserve_headq, tic);
+       }
 
-       /* we've got enough space */
-       xlog_grant_add_space(log, need_bytes);
-#ifdef DEBUG
-       tail_lsn = log->l_tail_lsn;
-       /*
-        * Check to make sure the grant write head didn't just over lap the
-        * tail.  If the cycles are the same, we can't be overlapping.
-        * Otherwise, make sure that the cycles differ by exactly one and
-        * check the byte count.
-        */
-       if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
-               ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
-               ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
+       if (!list_empty(&tic->t_queue)) {
+               spin_lock(&log->l_grant_reserve_lock);
+               list_del_init(&tic->t_queue);
+               spin_unlock(&log->l_grant_reserve_lock);
        }
-#endif
+
+       /* we've got enough space */
+       xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
+       xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
        trace_xfs_log_grant_exit(log, tic);
-       xlog_verify_grant_head(log, 1);
-       spin_unlock(&log->l_grant_lock);
+       xlog_verify_grant_tail(log);
        return 0;
 
- error_return:
-       if (tic->t_flags & XLOG_TIC_IN_Q)
-               xlog_del_ticketq(&log->l_reserve_headq, tic);
-
+error_return_unlocked:
+       spin_lock(&log->l_grant_reserve_lock);
+error_return:
+       list_del_init(&tic->t_queue);
+       spin_unlock(&log->l_grant_reserve_lock);
        trace_xfs_log_grant_error(log, tic);
 
        /*
@@ -2638,7 +2597,6 @@ redo:
         */
        tic->t_curr_res = 0;
        tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-       spin_unlock(&log->l_grant_lock);
        return XFS_ERROR(EIO);
 }      /* xlog_grant_log_space */
 
@@ -2646,17 +2604,14 @@ redo:
 /*
  * Replenish the byte reservation required by moving the grant write head.
  *
- *
+ * Similar to xlog_grant_log_space, the function is structured to have a
+ * lock-free fast path.
  */
 STATIC int
 xlog_regrant_write_log_space(xlog_t       *log,
                             xlog_ticket_t *tic)
 {
        int             free_bytes, need_bytes;
-       xlog_ticket_t   *ntic;
-#ifdef DEBUG
-       xfs_lsn_t       tail_lsn;
-#endif
 
        tic->t_curr_res = tic->t_unit_res;
        xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t         *log,
                panic("regrant Recovery problem");
 #endif
 
-       spin_lock(&log->l_grant_lock);
-
        trace_xfs_log_regrant_write_enter(log, tic);
-
        if (XLOG_FORCED_SHUTDOWN(log))
-               goto error_return;
+               goto error_return_unlocked;
 
        /* If there are other waiters on the queue then give them a
         * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t        *log,
         * this transaction.
         */
        need_bytes = tic->t_unit_res;
-       if ((ntic = log->l_write_headq)) {
-               free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
-                                            log->l_grant_write_bytes);
-               do {
+       if (!list_empty_careful(&log->l_writeq)) {
+               struct xlog_ticket *ntic;
+
+               spin_lock(&log->l_grant_write_lock);
+               free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+               list_for_each_entry(ntic, &log->l_writeq, t_queue) {
                        ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
 
                        if (free_bytes < ntic->t_unit_res)
                                break;
                        free_bytes -= ntic->t_unit_res;
-                       sv_signal(&ntic->t_wait);
-                       ntic = ntic->t_next;
-               } while (ntic != log->l_write_headq);
-
-               if (ntic != log->l_write_headq) {
-                       if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
-                               xlog_ins_ticketq(&log->l_write_headq, tic);
+                       wake_up(&ntic->t_wait);
+               }
 
+               if (ntic != list_first_entry(&log->l_writeq,
+                                               struct xlog_ticket, t_queue)) {
+                       if (list_empty(&tic->t_queue))
+                               list_add_tail(&tic->t_queue, &log->l_writeq);
                        trace_xfs_log_regrant_write_sleep1(log, tic);
 
-                       spin_unlock(&log->l_grant_lock);
-                       xlog_grant_push_ail(log->l_mp, need_bytes);
-                       spin_lock(&log->l_grant_lock);
+                       xlog_grant_push_ail(log, need_bytes);
 
                        XFS_STATS_INC(xs_sleep_logspace);
-                       sv_wait(&tic->t_wait, PINOD|PLTWAIT,
-                               &log->l_grant_lock, s);
-
-                       /* If we're shutting down, this tic is already
-                        * off the queue */
-                       spin_lock(&log->l_grant_lock);
-                       if (XLOG_FORCED_SHUTDOWN(log))
-                               goto error_return;
-
+                       xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
                        trace_xfs_log_regrant_write_wake1(log, tic);
-               }
+               } else
+                       spin_unlock(&log->l_grant_write_lock);
        }
 
 redo:
        if (XLOG_FORCED_SHUTDOWN(log))
-               goto error_return;
+               goto error_return_unlocked;
 
-       free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
-                                    log->l_grant_write_bytes);
+       free_bytes = xlog_space_left(log, &log->l_grant_write_head);
        if (free_bytes < need_bytes) {
-               if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
-                       xlog_ins_ticketq(&log->l_write_headq, tic);
-               spin_unlock(&log->l_grant_lock);
-               xlog_grant_push_ail(log->l_mp, need_bytes);
-               spin_lock(&log->l_grant_lock);
-
-               XFS_STATS_INC(xs_sleep_logspace);
-               trace_xfs_log_regrant_write_sleep2(log, tic);
-
-               sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
+               spin_lock(&log->l_grant_write_lock);
+               if (list_empty(&tic->t_queue))
+                       list_add_tail(&tic->t_queue, &log->l_writeq);
 
-               /* If we're shutting down, this tic is already off the queue */
-               spin_lock(&log->l_grant_lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto error_return;
 
+               xlog_grant_push_ail(log, need_bytes);
+
+               XFS_STATS_INC(xs_sleep_logspace);
+               trace_xfs_log_regrant_write_sleep2(log, tic);
+               xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+
                trace_xfs_log_regrant_write_wake2(log, tic);
                goto redo;
-       } else if (tic->t_flags & XLOG_TIC_IN_Q)
-               xlog_del_ticketq(&log->l_write_headq, tic);
+       }
 
-       /* we've got enough space */
-       xlog_grant_add_space_write(log, need_bytes);
-#ifdef DEBUG
-       tail_lsn = log->l_tail_lsn;
-       if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
-               ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
-               ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
+       if (!list_empty(&tic->t_queue)) {
+               spin_lock(&log->l_grant_write_lock);
+               list_del_init(&tic->t_queue);
+               spin_unlock(&log->l_grant_write_lock);
        }
-#endif
 
+       /* we've got enough space */
+       xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
        trace_xfs_log_regrant_write_exit(log, tic);
-
-       xlog_verify_grant_head(log, 1);
-       spin_unlock(&log->l_grant_lock);
+       xlog_verify_grant_tail(log);
        return 0;
 
 
+ error_return_unlocked:
+       spin_lock(&log->l_grant_write_lock);
  error_return:
-       if (tic->t_flags & XLOG_TIC_IN_Q)
-               xlog_del_ticketq(&log->l_reserve_headq, tic);
-
+       list_del_init(&tic->t_queue);
+       spin_unlock(&log->l_grant_write_lock);
        trace_xfs_log_regrant_write_error(log, tic);
 
        /*
@@ -2778,7 +2714,6 @@ redo:
         */
        tic->t_curr_res = 0;
        tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-       spin_unlock(&log->l_grant_lock);
        return XFS_ERROR(EIO);
 }      /* xlog_regrant_write_log_space */
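
The rewritten write-grant path follows one consistent discipline: sample the
grant head locklessly, take the queue's own spinlock only to enqueue the
ticket, re-check shutdown under that lock (so a wakeup from the shutdown path
cannot be missed), and let xlog_wait() drop the lock while sleeping. A
condensed sketch built purely from the helpers this patch introduces; the
AIL push, stats and tracepoints are elided, so this is illustrative rather
than the full function:

	static int
	xlog_grant_write_sketch(struct log *log, struct xlog_ticket *tic,
				int need_bytes)
	{
	redo:
		if (XLOG_FORCED_SHUTDOWN(log))
			goto error_return_unlocked;

		if (xlog_space_left(log, &log->l_grant_write_head) < need_bytes) {
			spin_lock(&log->l_grant_write_lock);
			if (list_empty(&tic->t_queue))
				list_add_tail(&tic->t_queue, &log->l_writeq);
			if (XLOG_FORCED_SHUTDOWN(log))
				goto error_return;
			/* xlog_wait() drops l_grant_write_lock before sleeping */
			xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
			goto redo;
		}

		/* got space: leave the queue, then account for the grant */
		if (!list_empty(&tic->t_queue)) {
			spin_lock(&log->l_grant_write_lock);
			list_del_init(&tic->t_queue);
			spin_unlock(&log->l_grant_write_lock);
		}
		xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
		return 0;

	error_return_unlocked:
		spin_lock(&log->l_grant_write_lock);
	error_return:
		/* safe even if never queued: t_queue is initialised at alloc */
		list_del_init(&tic->t_queue);
		spin_unlock(&log->l_grant_write_lock);
		return XFS_ERROR(EIO);
	}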
 
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t        *log,
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
 
-       spin_lock(&log->l_grant_lock);
-       xlog_grant_sub_space(log, ticket->t_curr_res);
+       xlog_grant_sub_space(log, &log->l_grant_reserve_head,
+                                       ticket->t_curr_res);
+       xlog_grant_sub_space(log, &log->l_grant_write_head,
+                                       ticket->t_curr_res);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
 
        trace_xfs_log_regrant_reserve_sub(log, ticket);
 
-       xlog_verify_grant_head(log, 1);
-
        /* just return if we still have some of the pre-reserved space */
-       if (ticket->t_cnt > 0) {
-               spin_unlock(&log->l_grant_lock);
+       if (ticket->t_cnt > 0)
                return;
-       }
 
-       xlog_grant_add_space_reserve(log, ticket->t_unit_res);
+       xlog_grant_add_space(log, &log->l_grant_reserve_head,
+                                       ticket->t_unit_res);
 
        trace_xfs_log_regrant_reserve_exit(log, ticket);
 
-       xlog_verify_grant_head(log, 0);
-       spin_unlock(&log->l_grant_lock);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
 }      /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
 xlog_ungrant_log_space(xlog_t       *log,
                       xlog_ticket_t *ticket)
 {
+       int     bytes;
+
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
 
-       spin_lock(&log->l_grant_lock);
        trace_xfs_log_ungrant_enter(log, ticket);
-
-       xlog_grant_sub_space(log, ticket->t_curr_res);
-
        trace_xfs_log_ungrant_sub(log, ticket);
 
-       /* If this is a permanent reservation ticket, we may be able to free
+       /*
+        * If this is a permanent reservation ticket, we may be able to free
         * up more space based on the remaining count.
         */
+       bytes = ticket->t_curr_res;
        if (ticket->t_cnt > 0) {
                ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
-               xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
+               bytes += ticket->t_unit_res*ticket->t_cnt;
        }
 
+       xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
+       xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
+
        trace_xfs_log_ungrant_exit(log, ticket);
 
-       xlog_verify_grant_head(log, 1);
-       spin_unlock(&log->l_grant_lock);
        xfs_log_move_tail(log->l_mp, 1);
 }      /* xlog_ungrant_log_space */
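
With the grant lock gone, the ungrant path folds the whole give-back into a
single subtraction per grant head. Worked example: a permanent ticket
(XLOG_TIC_PERM_RESERV) that still has t_cnt = 2 after the initial decrement,
with t_curr_res = 4096 and t_unit_res = 32768, returns
bytes = 4096 + 2 * 32768 = 69632 to both the reserve and write heads as one
atomic update each, instead of several separate subtractions under a global
spinlock.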
 
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
 
        if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
                /* update tail before writing to iclog */
-               xlog_assign_tail_lsn(log->l_mp);
+               xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
                sync++;
                iclog->ic_state = XLOG_STATE_SYNCING;
-               iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
-               xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
+               iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+               xlog_verify_tail_lsn(log, iclog, tail_lsn);
                /* cycle incremented when incrementing curr_block */
        }
        spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
                        return XFS_ERROR(EIO);
                }
                XFS_STATS_INC(xs_log_force_sleep);
-               sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
+               xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
                /*
                 * No need to grab the log lock here since we're
                 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
 
                                XFS_STATS_INC(xs_log_force_sleep);
 
-                               sv_wait(&iclog->ic_prev->ic_write_wait,
-                                       PSWP, &log->l_icloglock, s);
+                               xlog_wait(&iclog->ic_prev->ic_write_wait,
+                                                       &log->l_icloglock);
                                if (log_flushed)
                                        *log_flushed = 1;
                                already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
                                return XFS_ERROR(EIO);
                        }
                        XFS_STATS_INC(xs_log_force_sleep);
-                       sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
+                       xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
                        /*
                         * No need to grab the log lock here since we're
                         * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
        xlog_ticket_t   *ticket)
 {
        ASSERT(atomic_read(&ticket->t_ref) > 0);
-       if (atomic_dec_and_test(&ticket->t_ref)) {
-               sv_destroy(&ticket->t_wait);
+       if (atomic_dec_and_test(&ticket->t_ref))
                kmem_zone_free(xfs_log_ticket_zone, ticket);
-       }
 }
 
 xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
         }
 
        atomic_set(&tic->t_ref, 1);
+       INIT_LIST_HEAD(&tic->t_queue);
        tic->t_unit_res         = unit_bytes;
        tic->t_curr_res         = unit_bytes;
        tic->t_cnt              = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
        tic->t_trans_type       = 0;
        if (xflags & XFS_LOG_PERM_RESERV)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
-       sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
+       init_waitqueue_head(&tic->t_wait);
 
        xlog_tic_reset_res(tic);
 
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
 }
 
 STATIC void
-xlog_verify_grant_head(xlog_t *log, int equals)
+xlog_verify_grant_tail(
+       struct log      *log)
 {
-    if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
-       if (equals)
-           ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
-       else
-           ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
-    } else {
-       ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
-       ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
-    }
-}      /* xlog_verify_grant_head */
+       int             tail_cycle, tail_blocks;
+       int             cycle, space;
+
+       /*
+        * Check to make sure the grant write head didn't just overlap the
+        * tail.  If the cycles are the same, we can't be overlapping.
+        * Otherwise, make sure that the cycles differ by exactly one and
+        * check the byte count.
+        */
+       xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+       xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+       if (tail_cycle != cycle) {
+               ASSERT(cycle - 1 == tail_cycle);
+               ASSERT(space <= BBTOB(tail_blocks));
+       }
+}
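
Concretely: if the tail LSN cracks to cycle 6, block 100 while the write head
cracks to cycle 7, the head has wrapped exactly once past the tail, so its
byte count must satisfy space <= BBTOB(100) = 51200 bytes (100 basic blocks
of 512 bytes each); equal cycles need no byte comparison at all.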
 
 /* check if it will fit */
 STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
                xlog_cil_force(log);
 
        /*
-        * We must hold both the GRANT lock and the LOG lock,
-        * before we mark the filesystem SHUTDOWN and wake
-        * everybody up to tell the bad news.
+        * mark the filesystem and the log as in a shutdown state and wake
+        * everybody up to tell them the bad news.
         */
        spin_lock(&log->l_icloglock);
-       spin_lock(&log->l_grant_lock);
        mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
        if (mp->m_sb_bp)
                XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
        spin_unlock(&log->l_icloglock);
 
        /*
-        * We don't want anybody waiting for log reservations
-        * after this. That means we have to wake up everybody
-        * queued up on reserve_headq as well as write_headq.
-        * In addition, we make sure in xlog_{re}grant_log_space
-        * that we don't enqueue anything once the SHUTDOWN flag
-        * is set, and this action is protected by the GRANTLOCK.
+        * We don't want anybody waiting for log reservations after this. That
+        * means we have to wake up everybody queued up on reserveq as well as
+        * writeq.  In addition, we make sure in xlog_{re}grant_log_space that
+        * we don't enqueue anything once the SHUTDOWN flag is set, and this
+        * action is protected by the grant locks.
         */
-       if ((tic = log->l_reserve_headq)) {
-               do {
-                       sv_signal(&tic->t_wait);
-                       tic = tic->t_next;
-               } while (tic != log->l_reserve_headq);
-       }
-
-       if ((tic = log->l_write_headq)) {
-               do {
-                       sv_signal(&tic->t_wait);
-                       tic = tic->t_next;
-               } while (tic != log->l_write_headq);
-       }
-       spin_unlock(&log->l_grant_lock);
+       spin_lock(&log->l_grant_reserve_lock);
+       list_for_each_entry(tic, &log->l_reserveq, t_queue)
+               wake_up(&tic->t_wait);
+       spin_unlock(&log->l_grant_reserve_lock);
+
+       spin_lock(&log->l_grant_write_lock);
+       list_for_each_entry(tic, &log->l_writeq, t_queue)
+               wake_up(&tic->t_wait);
+       spin_unlock(&log->l_grant_write_lock);
 
        if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
                ASSERT(!logerror);
index 23d6ceb..9dc8125 100644 (file)
@@ -61,7 +61,7 @@ xlog_cil_init(
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
        init_rwsem(&cil->xc_ctx_lock);
-       sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+       init_waitqueue_head(&cil->xc_commit_wait);
 
        INIT_LIST_HEAD(&ctx->committing);
        INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
        int     abort)
 {
        struct xfs_cil_ctx      *ctx = args;
-       struct xfs_log_vec      *lv;
-       int                     abortflag = abort ? XFS_LI_ABORTED : 0;
        struct xfs_busy_extent  *busyp, *n;
 
-       /* unpin all the log items */
-       for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
-               xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
-                                                       abortflag);
-       }
+       xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+                                       ctx->start_lsn, abort);
 
        list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
                xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -568,7 +563,7 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                       sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                       xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
        }
@@ -592,7 +587,7 @@ restart:
         */
        spin_lock(&cil->xc_cil_lock);
        ctx->commit_lsn = commit_lsn;
-       sv_broadcast(&cil->xc_commit_wait);
+       wake_up_all(&cil->xc_commit_wait);
        spin_unlock(&cil->xc_cil_lock);
 
        /* release the hounds! */
@@ -757,7 +752,7 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                       sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                       xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
                if (ctx->sequence != sequence)
index edcdfe0..d5f8be8 100644 (file)
@@ -21,7 +21,6 @@
 struct xfs_buf;
 struct log;
 struct xlog_ticket;
-struct xfs_buf_cancel;
 struct xfs_mount;
 
 /*
@@ -54,7 +53,6 @@ struct xfs_mount;
        BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
 
-
 static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
 {
        return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
  */
 #define XLOG_TIC_INITED                0x1     /* has been initialized */
 #define XLOG_TIC_PERM_RESERV   0x2     /* permanent reservation */
-#define XLOG_TIC_IN_Q          0x4
 
 #define XLOG_TIC_FLAGS \
        { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
-       { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \
-       { XLOG_TIC_IN_Q,        "XLOG_TIC_IN_Q" }
+       { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
 
 #endif /* __KERNEL__ */
 
@@ -244,9 +240,8 @@ typedef struct xlog_res {
 } xlog_res_t;
 
 typedef struct xlog_ticket {
-       sv_t               t_wait;       /* ticket wait queue            : 20 */
-       struct xlog_ticket *t_next;      /*                              :4|8 */
-       struct xlog_ticket *t_prev;      /*                              :4|8 */
+       wait_queue_head_t  t_wait;       /* ticket wait queue */
+       struct list_head   t_queue;      /* reserve/write queue */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
        atomic_t           t_ref;        /* ticket reference count       : 4  */
        int                t_curr_res;   /* current reservation in bytes : 4  */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
  * and move everything else out to subsequent cachelines.
  */
 typedef struct xlog_in_core {
-       sv_t                    ic_force_wait;
-       sv_t                    ic_write_wait;
+       wait_queue_head_t       ic_force_wait;
+       wait_queue_head_t       ic_write_wait;
        struct xlog_in_core     *ic_next;
        struct xlog_in_core     *ic_prev;
        struct xfs_buf          *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
        struct xfs_cil_ctx      *xc_ctx;
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
-       sv_t                    xc_commit_wait;
+       wait_queue_head_t       xc_commit_wait;
        xfs_lsn_t               xc_current_sequence;
 };
 
@@ -491,7 +486,7 @@ typedef struct log {
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
        uint                    l_flags;
        uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
-       struct xfs_buf_cancel   **l_buf_cancel_table;
+       struct list_head        *l_buf_cancel_table;
        int                     l_iclog_hsize;  /* size of iclog header */
        int                     l_iclog_heads;  /* # of iclog header sectors */
        uint                    l_sectBBsize;   /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
        int                     l_logBBsize;    /* size of log in BB chunks */
 
        /* The following block of fields are changed while holding icloglock */
-       sv_t                    l_flush_wait ____cacheline_aligned_in_smp;
+       wait_queue_head_t       l_flush_wait ____cacheline_aligned_in_smp;
                                                /* waiting for iclog flush */
        int                     l_covered_state;/* state of "covering disk
                                                 * log entries" */
        xlog_in_core_t          *l_iclog;       /* head log queue       */
        spinlock_t              l_icloglock;    /* grab to change iclog state */
-       xfs_lsn_t               l_tail_lsn;     /* lsn of 1st LR with unflushed
-                                                * buffers */
-       xfs_lsn_t               l_last_sync_lsn;/* lsn of last LR on disk */
        int                     l_curr_cycle;   /* Cycle number of log writes */
        int                     l_prev_cycle;   /* Cycle number before last
                                                 * block increment */
        int                     l_curr_block;   /* current logical log block */
        int                     l_prev_block;   /* previous logical log block */
 
-       /* The following block of fields are changed while holding grant_lock */
-       spinlock_t              l_grant_lock ____cacheline_aligned_in_smp;
-       xlog_ticket_t           *l_reserve_headq;
-       xlog_ticket_t           *l_write_headq;
-       int                     l_grant_reserve_cycle;
-       int                     l_grant_reserve_bytes;
-       int                     l_grant_write_cycle;
-       int                     l_grant_write_bytes;
+       /*
+        * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
+        * read without needing to hold specific locks. To avoid operations
+        * contending with other hot objects, place each of them on a separate
+        * cacheline.
+        */
+       /* lsn of last LR on disk */
+       atomic64_t              l_last_sync_lsn ____cacheline_aligned_in_smp;
+       /* lsn of 1st LR with unflushed buffers */
+       atomic64_t              l_tail_lsn ____cacheline_aligned_in_smp;
+
+       /*
+        * ticket grant locks, queues and accounting have their own cachelines
+        * as these are quite hot and can be operated on concurrently.
+        */
+       spinlock_t              l_grant_reserve_lock ____cacheline_aligned_in_smp;
+       struct list_head        l_reserveq;
+       atomic64_t              l_grant_reserve_head;
+
+       spinlock_t              l_grant_write_lock ____cacheline_aligned_in_smp;
+       struct list_head        l_writeq;
+       atomic64_t              l_grant_write_head;
 
        /* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
 
 } xlog_t;
 
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+       ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
 #define XLOG_FORCED_SHUTDOWN(log)      ((log)->l_flags & XLOG_IO_ERROR)
 
 /* common routines */
@@ -561,6 +570,61 @@ int        xlog_write(struct log *log, struct xfs_log_vec *log_vector,
                                struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
                                xlog_in_core_t **commit_iclog, uint flags);
 
+/*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+       xfs_lsn_t val = atomic64_read(lsn);
+
+       *cycle = CYCLE_LSN(val);
+       *block = BLOCK_LSN(val);
+}
+
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+       atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+       *cycle = val >> 32;
+       *space = val & 0xffffffff;
+}
+
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+       xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+       return ((int64_t)cycle << 32) | space;
+}
+
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+       atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
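
These crack/assign helpers exist so the grant heads can be moved without any
lock at all: sample the packed 64 bit value, crack it, adjust cycle and byte
count, re-pack, and retry with a compare-and-swap if another CPU moved the
head in the meantime. A sketch of the add-space side in that shape, assuming
the log's l_logsize field (the log size in bytes, which this excerpt of the
structure truncates); the real series converts its grant-head manipulations
along these lines:

	static void
	xlog_grant_add_space(
		struct log	*log,
		atomic64_t	*head,
		int		bytes)
	{
		int64_t	head_val = atomic64_read(head);
		int64_t	new, old;

		do {
			int	tmp;
			int	cycle, space;

			xlog_crack_grant_head_val(head_val, &cycle, &space);

			tmp = log->l_logsize - space;
			if (tmp > bytes)
				space += bytes;		/* fits in this cycle */
			else {
				space = bytes - tmp;	/* wrap to next cycle */
				cycle++;
			}

			old = head_val;
			new = xlog_assign_grant_head_val(cycle, space);
			/* retry if another CPU changed the head under us */
			head_val = atomic64_cmpxchg(head, old, new);
		} while (head_val != old);
	}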
+
 /*
  * Committed Item List interfaces
  */
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
  */
 #define XLOG_UNMOUNT_REC_TYPE  (-1U)
 
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+       add_wait_queue_exclusive(wq, &wait);
+       __set_current_state(TASK_UNINTERRUPTIBLE);
+       spin_unlock(lock);
+       schedule();
+       remove_wait_queue(wq, &wait);
+}
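
xlog_wait() is only safe because every waker holds the same spinlock while
calling wake_up(): the sleeper's condition check, queueing and
__set_current_state() all happen under the lock, so the waker either sees the
sleeper on the queue or the sleeper sees the condition change before it
schedules. The matching wake side, exactly as the shutdown path above does it:

	spin_lock(&log->l_grant_write_lock);
	list_for_each_entry(tic, &log->l_writeq, t_queue)
		wake_up(&tic->t_wait);
	spin_unlock(&log->l_grant_write_lock);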
 #endif /* __KERNEL__ */
 
 #endif /* __XFS_LOG_PRIV_H__ */
index 966d3f9..204d8e5 100644 (file)
@@ -52,6 +52,17 @@ STATIC void  xlog_recover_check_summary(xlog_t *);
 #define        xlog_recover_check_summary(log)
 #endif
 
+/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+       xfs_daddr_t             bc_blkno;
+       uint                    bc_len;
+       int                     bc_refcount;
+       struct list_head        bc_list;
+};
+
 /*
  * Sector aligned buffer routines for buffer create/read/write/access
  */
@@ -925,12 +936,12 @@ xlog_find_tail(
        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
        if (found == 2)
                log->l_curr_cycle++;
-       log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
-       log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
-       log->l_grant_reserve_cycle = log->l_curr_cycle;
-       log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
-       log->l_grant_write_cycle = log->l_curr_cycle;
-       log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+       xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+       xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
 
        /*
         * Look for unmount record.  If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
        }
        after_umount_blk = (i + hblks + (int)
                BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
-       tail_lsn = log->l_tail_lsn;
+       tail_lsn = atomic64_read(&log->l_tail_lsn);
        if (*head_blk == after_umount_blk &&
            be32_to_cpu(rhead->h_num_logops) == 1) {
                umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
                         * log records will point recovery to after the
                         * current unmount record.
                         */
-                       log->l_tail_lsn =
-                               xlog_assign_lsn(log->l_curr_cycle,
-                                               after_umount_blk);
-                       log->l_last_sync_lsn =
-                               xlog_assign_lsn(log->l_curr_cycle,
-                                               after_umount_blk);
+                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
                        *tail_blk = after_umount_blk;
 
                        /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
  * record in the table to tell us how many times we expect to see this
  * record during the second pass.
  */
-STATIC void
-xlog_recover_do_buffer_pass1(
-       xlog_t                  *log,
-       xfs_buf_log_format_t    *buf_f)
+STATIC int
+xlog_recover_buffer_pass1(
+       struct log              *log,
+       xlog_recover_item_t     *item)
 {
-       xfs_buf_cancel_t        *bcp;
-       xfs_buf_cancel_t        *nextp;
-       xfs_buf_cancel_t        *prevp;
-       xfs_buf_cancel_t        **bucket;
-       xfs_daddr_t             blkno = 0;
-       uint                    len = 0;
-       ushort                  flags = 0;
-
-       switch (buf_f->blf_type) {
-       case XFS_LI_BUF:
-               blkno = buf_f->blf_blkno;
-               len = buf_f->blf_len;
-               flags = buf_f->blf_flags;
-               break;
-       }
+       xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
+       struct list_head        *bucket;
+       struct xfs_buf_cancel   *bcp;
 
        /*
         * If this isn't a cancel buffer item, then just return.
         */
-       if (!(flags & XFS_BLF_CANCEL)) {
+       if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
                trace_xfs_log_recover_buf_not_cancel(log, buf_f);
-               return;
-       }
-
-       /*
-        * Insert an xfs_buf_cancel record into the hash table of
-        * them.  If there is already an identical record, bump
-        * its reference count.
-        */
-       bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
-                                         XLOG_BC_TABLE_SIZE];
-       /*
-        * If the hash bucket is empty then just insert a new record into
-        * the bucket.
-        */
-       if (*bucket == NULL) {
-               bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
-                                                    KM_SLEEP);
-               bcp->bc_blkno = blkno;
-               bcp->bc_len = len;
-               bcp->bc_refcount = 1;
-               bcp->bc_next = NULL;
-               *bucket = bcp;
-               return;
+               return 0;
        }
 
        /*
-        * The hash bucket is not empty, so search for duplicates of our
-        * record.  If we find one them just bump its refcount.  If not
-        * then add us at the end of the list.
+        * Insert an xfs_buf_cancel record into the hash table of them.
+        * If there is already an identical record, bump its reference count.
         */
-       prevp = NULL;
-       nextp = *bucket;
-       while (nextp != NULL) {
-               if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
-                       nextp->bc_refcount++;
+       bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
+       list_for_each_entry(bcp, bucket, bc_list) {
+               if (bcp->bc_blkno == buf_f->blf_blkno &&
+                   bcp->bc_len == buf_f->blf_len) {
+                       bcp->bc_refcount++;
                        trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
-                       return;
+                       return 0;
                }
-               prevp = nextp;
-               nextp = nextp->bc_next;
-       }
-       ASSERT(prevp != NULL);
-       bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
-                                            KM_SLEEP);
-       bcp->bc_blkno = blkno;
-       bcp->bc_len = len;
+       }
+
+       bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+       bcp->bc_blkno = buf_f->blf_blkno;
+       bcp->bc_len = buf_f->blf_len;
        bcp->bc_refcount = 1;
-       bcp->bc_next = NULL;
-       prevp->bc_next = bcp;
+       list_add_tail(&bcp->bc_list, bucket);
+
        trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+       return 0;
 }
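
The refcount is what ties the two passes together: a buffer cancelled twice in
the replayed region of the log gets bc_refcount = 2 from pass 1; each time
pass 2 meets one of those cancel items, xlog_check_buffer_cancelled() below
decrements the count and frees the record on the last reference, so stale
cancel records cannot leak into later replays.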
 
 /*
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
  */
 STATIC int
 xlog_check_buffer_cancelled(
-       xlog_t                  *log,
+       struct log              *log,
        xfs_daddr_t             blkno,
        uint                    len,
        ushort                  flags)
 {
-       xfs_buf_cancel_t        *bcp;
-       xfs_buf_cancel_t        *prevp;
-       xfs_buf_cancel_t        **bucket;
+       struct list_head        *bucket;
+       struct xfs_buf_cancel   *bcp;
 
        if (log->l_buf_cancel_table == NULL) {
                /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
                return 0;
        }
 
-       bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
-                                         XLOG_BC_TABLE_SIZE];
-       bcp = *bucket;
-       if (bcp == NULL) {
-               /*
-                * There is no corresponding entry in the table built
-                * in pass one, so this buffer has not been cancelled.
-                */
-               ASSERT(!(flags & XFS_BLF_CANCEL));
-               return 0;
-       }
-
        /*
-        * Search for an entry in the buffer cancel table that
-        * matches our buffer.
+        * Search for an entry in the buffer cancel table that matches our buffer.
         */
-       prevp = NULL;
-       while (bcp != NULL) {
-               if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
-                       /*
-                        * We've go a match, so return 1 so that the
-                        * recovery of this buffer is cancelled.
-                        * If this buffer is actually a buffer cancel
-                        * log item, then decrement the refcount on the
-                        * one in the table and remove it if this is the
-                        * last reference.
-                        */
-                       if (flags & XFS_BLF_CANCEL) {
-                               bcp->bc_refcount--;
-                               if (bcp->bc_refcount == 0) {
-                                       if (prevp == NULL) {
-                                               *bucket = bcp->bc_next;
-                                       } else {
-                                               prevp->bc_next = bcp->bc_next;
-                                       }
-                                       kmem_free(bcp);
-                               }
-                       }
-                       return 1;
-               }
-               prevp = bcp;
-               bcp = bcp->bc_next;
+       bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+       list_for_each_entry(bcp, bucket, bc_list) {
+               if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+                       goto found;
        }
+
        /*
-        * We didn't find a corresponding entry in the table, so
-        * return 0 so that the buffer is NOT cancelled.
+        * We didn't find a corresponding entry in the table, so return 0 so
+        * that the buffer is NOT cancelled.
         */
        ASSERT(!(flags & XFS_BLF_CANCEL));
        return 0;
-}
 
-STATIC int
-xlog_recover_do_buffer_pass2(
-       xlog_t                  *log,
-       xfs_buf_log_format_t    *buf_f)
-{
-       xfs_daddr_t             blkno = 0;
-       ushort                  flags = 0;
-       uint                    len = 0;
-
-       switch (buf_f->blf_type) {
-       case XFS_LI_BUF:
-               blkno = buf_f->blf_blkno;
-               flags = buf_f->blf_flags;
-               len = buf_f->blf_len;
-               break;
+found:
+       /*
+        * We've got a match, so return 1 so that the recovery of this buffer
+        * is cancelled.  If this buffer is actually a buffer cancel log
+        * item, then decrement the refcount on the one in the table and
+        * remove it if this is the last reference.
+        */
+       if (flags & XFS_BLF_CANCEL) {
+               if (--bcp->bc_refcount == 0) {
+                       list_del(&bcp->bc_list);
+                       kmem_free(bcp);
+               }
        }
-
-       return xlog_check_buffer_cancelled(log, blkno, len, flags);
+       return 1;
 }
 
 /*
- * Perform recovery for a buffer full of inodes.  In these buffers,
- * the only data which should be recovered is that which corresponds
- * to the di_next_unlinked pointers in the on disk inode structures.
- * The rest of the data for the inodes is always logged through the
- * inodes themselves rather than the inode buffer and is recovered
- * in xlog_recover_do_inode_trans().
+ * Perform recovery for a buffer full of inodes.  In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures.  The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
  *
- * The only time when buffers full of inodes are fully recovered is
- * when the buffer is full of newly allocated inodes.  In this case
- * the buffer will not be marked as an inode buffer and so will be
- * sent to xlog_recover_do_reg_buffer() below during recovery.
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes.  In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
  */
 STATIC int
 xlog_recover_do_inode_buffer(
-       xfs_mount_t             *mp,
+       struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
-       xfs_buf_t               *bp,
+       struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
 {
        int                     i;
-       int                     item_index;
-       int                     bit;
-       int                     nbits;
-       int                     reg_buf_offset;
-       int                     reg_buf_bytes;
+       int                     item_index = 0;
+       int                     bit = 0;
+       int                     nbits = 0;
+       int                     reg_buf_offset = 0;
+       int                     reg_buf_bytes = 0;
        int                     next_unlinked_offset;
        int                     inodes_per_buf;
        xfs_agino_t             *logged_nextp;
        xfs_agino_t             *buffer_nextp;
-       unsigned int            *data_map = NULL;
-       unsigned int            map_size = 0;
 
        trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
 
-       switch (buf_f->blf_type) {
-       case XFS_LI_BUF:
-               data_map = buf_f->blf_data_map;
-               map_size = buf_f->blf_map_size;
-               break;
-       }
-       /*
-        * Set the variables corresponding to the current region to
-        * 0 so that we'll initialize them on the first pass through
-        * the loop.
-        */
-       reg_buf_offset = 0;
-       reg_buf_bytes = 0;
-       bit = 0;
-       nbits = 0;
-       item_index = 0;
        inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
        for (i = 0; i < inodes_per_buf; i++) {
                next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
                         * the current di_next_unlinked field.
                         */
                        bit += nbits;
-                       bit = xfs_next_bit(data_map, map_size, bit);
+                       bit = xfs_next_bit(buf_f->blf_data_map,
+                                          buf_f->blf_map_size, bit);
 
                        /*
                         * If there are no more logged regions in the
                         * buffer, then we're done.
                         */
-                       if (bit == -1) {
+                       if (bit == -1)
                                return 0;
-                       }
 
-                       nbits = xfs_contig_bits(data_map, map_size,
-                                                        bit);
+                       nbits = xfs_contig_bits(buf_f->blf_data_map,
+                                               buf_f->blf_map_size, bit);
                        ASSERT(nbits > 0);
                        reg_buf_offset = bit << XFS_BLF_SHIFT;
                        reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
                 * di_next_unlinked field, then move on to the next
                 * di_next_unlinked field.
                 */
-               if (next_unlinked_offset < reg_buf_offset) {
+               if (next_unlinked_offset < reg_buf_offset)
                        continue;
-               }
 
                ASSERT(item->ri_buf[item_index].i_addr != NULL);
                ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
  * given buffer.  The bitmap in the buf log format structure indicates
  * where to place the logged data.
  */
-/*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
        struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
-       xfs_buf_t               *bp,
+       struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
 {
        int                     i;
        int                     bit;
        int                     nbits;
-       unsigned int            *data_map = NULL;
-       unsigned int            map_size = 0;
        int                     error;
 
        trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
 
-       switch (buf_f->blf_type) {
-       case XFS_LI_BUF:
-               data_map = buf_f->blf_data_map;
-               map_size = buf_f->blf_map_size;
-               break;
-       }
        bit = 0;
        i = 1;  /* 0 is the buf format structure */
        while (1) {
-               bit = xfs_next_bit(data_map, map_size, bit);
+               bit = xfs_next_bit(buf_f->blf_data_map,
+                                  buf_f->blf_map_size, bit);
                if (bit == -1)
                        break;
-               nbits = xfs_contig_bits(data_map, map_size, bit);
+               nbits = xfs_contig_bits(buf_f->blf_data_map,
+                                       buf_f->blf_map_size, bit);
                ASSERT(nbits > 0);
                ASSERT(item->ri_buf[i].i_addr != NULL);
                ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
  * for more details on the implementation of the table of cancel records.
  */
 STATIC int
-xlog_recover_do_buffer_trans(
+xlog_recover_buffer_pass2(
        xlog_t                  *log,
-       xlog_recover_item_t     *item,
-       int                     pass)
+       xlog_recover_item_t     *item)
 {
        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
-       xfs_mount_t             *mp;
+       xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        int                     error;
-       int                     cancel;
-       xfs_daddr_t             blkno;
-       int                     len;
-       ushort                  flags;
        uint                    buf_flags;
 
-       if (pass == XLOG_RECOVER_PASS1) {
-               /*
-                * In this pass we're only looking for buf items
-                * with the XFS_BLF_CANCEL bit set.
-                */
-               xlog_recover_do_buffer_pass1(log, buf_f);
+       /*
+        * In this pass we only want to recover all the buffers which have
+        * not been cancelled and are not cancellation buffers themselves.
+        */
+       if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
+                       buf_f->blf_len, buf_f->blf_flags)) {
+               trace_xfs_log_recover_buf_cancel(log, buf_f);
                return 0;
-       } else {
-               /*
-                * In this pass we want to recover all the buffers
-                * which have not been cancelled and are not
-                * cancellation buffers themselves.  The routine
-                * we call here will tell us whether or not to
-                * continue with the replay of this buffer.
-                */
-               cancel = xlog_recover_do_buffer_pass2(log, buf_f);
-               if (cancel) {
-                       trace_xfs_log_recover_buf_cancel(log, buf_f);
-                       return 0;
-               }
        }
+
        trace_xfs_log_recover_buf_recover(log, buf_f);
-       switch (buf_f->blf_type) {
-       case XFS_LI_BUF:
-               blkno = buf_f->blf_blkno;
-               len = buf_f->blf_len;
-               flags = buf_f->blf_flags;
-               break;
-       default:
-               xfs_fs_cmn_err(CE_ALERT, log->l_mp,
-                       "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
-                       buf_f->blf_type, log->l_mp->m_logname ?
-                       log->l_mp->m_logname : "internal");
-               XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
-                                XFS_ERRLEVEL_LOW, log->l_mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
 
-       mp = log->l_mp;
        buf_flags = XBF_LOCK;
-       if (!(flags & XFS_BLF_INODE_BUF))
+       if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
                buf_flags |= XBF_MAPPED;
 
-       bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
+       bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+                         buf_flags);
        if (XFS_BUF_ISERROR(bp)) {
-               xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
-                                 bp, blkno);
+               xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
+                                 bp, buf_f->blf_blkno);
                error = XFS_BUF_GETERROR(bp);
                xfs_buf_relse(bp);
                return error;
        }
 
        error = 0;
-       if (flags & XFS_BLF_INODE_BUF) {
+       if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
-       } else if (flags &
+       } else if (buf_f->blf_flags &
                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
        } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
 }
 
 STATIC int
-xlog_recover_do_inode_trans(
+xlog_recover_inode_pass2(
        xlog_t                  *log,
-       xlog_recover_item_t     *item,
-       int                     pass)
+       xlog_recover_item_t     *item)
 {
        xfs_inode_log_format_t  *in_f;
-       xfs_mount_t             *mp;
+       xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        xfs_dinode_t            *dip;
-       xfs_ino_t               ino;
        int                     len;
        xfs_caddr_t             src;
        xfs_caddr_t             dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
        xfs_icdinode_t          *dicp;
        int                     need_free = 0;
 
-       if (pass == XLOG_RECOVER_PASS1) {
-               return 0;
-       }
-
        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
                in_f = item->ri_buf[0].i_addr;
        } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
                if (error)
                        goto error;
        }
-       ino = in_f->ilf_ino;
-       mp = log->l_mp;
 
        /*
         * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
-                       dip, bp, ino);
-               XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
+                       dip, bp, in_f->ilf_ino);
+               XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
                goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
-                       item, ino);
-               XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
+                       item, in_f->ilf_ino);
+               XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
                goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
        if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
-                       XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
+                       XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
                                         XFS_ERRLEVEL_LOW, mp, dicp);
                        xfs_buf_relse(bp);
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-                               item, dip, bp, ino);
+                               item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
                        goto error;
                }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
-                       XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
+                       XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
                                             XFS_ERRLEVEL_LOW, mp, dicp);
                        xfs_buf_relse(bp);
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-                               item, dip, bp, ino);
+                               item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
                        goto error;
                }
        }
        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
-               XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
+               XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
-                       item, dip, bp, ino,
+                       item, dip, bp, in_f->ilf_ino,
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
                error = EFSCORRUPTED;
                goto error;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
-               XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
+               XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
-                       item, dip, bp, ino, dicp->di_forkoff);
+                       item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
                error = EFSCORRUPTED;
                goto error;
        }
        if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
-               XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
+               XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
                        break;
 
                default:
-                       xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
+                       xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
                        ASSERT(0);
                        xfs_buf_relse(bp);
                        error = EIO;
@@ -2556,18 +2422,11 @@ error:
  * of that type.
  */
 STATIC int
-xlog_recover_do_quotaoff_trans(
+xlog_recover_quotaoff_pass1(
        xlog_t                  *log,
-       xlog_recover_item_t     *item,
-       int                     pass)
+       xlog_recover_item_t     *item)
 {
-       xfs_qoff_logformat_t    *qoff_f;
-
-       if (pass == XLOG_RECOVER_PASS2) {
-               return (0);
-       }
-
-       qoff_f = item->ri_buf[0].i_addr;
+       xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
        ASSERT(qoff_f);
 
        /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
  * Recover a dquot record
  */
 STATIC int
-xlog_recover_do_dquot_trans(
+xlog_recover_dquot_pass2(
        xlog_t                  *log,
-       xlog_recover_item_t     *item,
-       int                     pass)
+       xlog_recover_item_t     *item)
 {
-       xfs_mount_t             *mp;
+       xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        struct xfs_disk_dquot   *ddq, *recddq;
        int                     error;
        xfs_dq_logformat_t      *dq_f;
        uint                    type;
 
-       if (pass == XLOG_RECOVER_PASS1) {
-               return 0;
-       }
-       mp = log->l_mp;
 
        /*
         * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
        if ((error = xfs_qm_dqcheck(recddq,
                           dq_f->qlf_id,
                           0, XFS_QMOPT_DOWARN,
-                          "xlog_recover_do_dquot_trans (log copy)"))) {
+                          "xlog_recover_dquot_pass2 (log copy)"))) {
                return XFS_ERROR(EIO);
        }
        ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
         * minimal initialization then.
         */
        if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
-                          "xlog_recover_do_dquot_trans")) {
+                          "xlog_recover_dquot_pass2")) {
                xfs_buf_relse(bp);
                return XFS_ERROR(EIO);
        }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
  * LSN.
  */
 STATIC int
-xlog_recover_do_efi_trans(
+xlog_recover_efi_pass2(
        xlog_t                  *log,
        xlog_recover_item_t     *item,
-       xfs_lsn_t               lsn,
-       int                     pass)
+       xfs_lsn_t               lsn)
 {
        int                     error;
-       xfs_mount_t             *mp;
+       xfs_mount_t             *mp = log->l_mp;
        xfs_efi_log_item_t      *efip;
        xfs_efi_log_format_t    *efi_formatp;
 
-       if (pass == XLOG_RECOVER_PASS1) {
-               return 0;
-       }
-
        efi_formatp = item->ri_buf[0].i_addr;
 
-       mp = log->l_mp;
        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
        if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
                                         &(efip->efi_format)))) {
                xfs_efi_item_free(efip);
                return error;
        }
-       efip->efi_next_extent = efi_formatp->efi_nextents;
-       efip->efi_flags |= XFS_EFI_COMMITTED;
+       atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
 
        spin_lock(&log->l_ailp->xa_lock);
        /*
         * xfs_trans_ail_update() drops the AIL lock.
         */
-       xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
+       xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
        return 0;
 }
 
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
  * efd format structure.  If we find it, we remove the efi from the
  * AIL and free it.
  */
-STATIC void
-xlog_recover_do_efd_trans(
+STATIC int
+xlog_recover_efd_pass2(
        xlog_t                  *log,
-       xlog_recover_item_t     *item,
-       int                     pass)
+       xlog_recover_item_t     *item)
 {
        xfs_efd_log_format_t    *efd_formatp;
        xfs_efi_log_item_t      *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
        struct xfs_ail_cursor   cur;
        struct xfs_ail          *ailp = log->l_ailp;
 
-       if (pass == XLOG_RECOVER_PASS1) {
-               return;
-       }
-
        efd_formatp = item->ri_buf[0].i_addr;
        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
        }
        xfs_trans_ail_cursor_done(ailp, &cur);
        spin_unlock(&ailp->xa_lock);
-}
-
-/*
- * Perform the transaction
- *
- * If the transaction modifies a buffer or inode, do it now.  Otherwise,
- * EFIs and EFDs get queued up by adding entries into the AIL for them.
- */
-STATIC int
-xlog_recover_do_trans(
-       xlog_t                  *log,
-       xlog_recover_t          *trans,
-       int                     pass)
-{
-       int                     error = 0;
-       xlog_recover_item_t     *item;
-
-       error = xlog_recover_reorder_trans(log, trans, pass);
-       if (error)
-               return error;
-
-       list_for_each_entry(item, &trans->r_itemq, ri_list) {
-               trace_xfs_log_recover_item_recover(log, trans, item, pass);
-               switch (ITEM_TYPE(item)) {
-               case XFS_LI_BUF:
-                       error = xlog_recover_do_buffer_trans(log, item, pass);
-                       break;
-               case XFS_LI_INODE:
-                       error = xlog_recover_do_inode_trans(log, item, pass);
-                       break;
-               case XFS_LI_EFI:
-                       error = xlog_recover_do_efi_trans(log, item,
-                                                         trans->r_lsn, pass);
-                       break;
-               case XFS_LI_EFD:
-                       xlog_recover_do_efd_trans(log, item, pass);
-                       error = 0;
-                       break;
-               case XFS_LI_DQUOT:
-                       error = xlog_recover_do_dquot_trans(log, item, pass);
-                       break;
-               case XFS_LI_QUOTAOFF:
-                       error = xlog_recover_do_quotaoff_trans(log, item,
-                                                              pass);
-                       break;
-               default:
-                       xlog_warn(
-       "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
-                       ASSERT(0);
-                       error = XFS_ERROR(EIO);
-                       break;
-               }
-
-               if (error)
-                       return error;
-       }
 
        return 0;
 }
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
  */
 STATIC void
 xlog_recover_free_trans(
-       xlog_recover_t          *trans)
+       struct xlog_recover     *trans)
 {
        xlog_recover_item_t     *item, *n;
        int                     i;
@@ -2870,18 +2656,96 @@ xlog_recover_free_trans(
        kmem_free(trans);
 }
 
+STATIC int
+xlog_recover_commit_pass1(
+       struct log              *log,
+       struct xlog_recover     *trans,
+       xlog_recover_item_t     *item)
+{
+       trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+
+       switch (ITEM_TYPE(item)) {
+       case XFS_LI_BUF:
+               return xlog_recover_buffer_pass1(log, item);
+       case XFS_LI_QUOTAOFF:
+               return xlog_recover_quotaoff_pass1(log, item);
+       case XFS_LI_INODE:
+       case XFS_LI_EFI:
+       case XFS_LI_EFD:
+       case XFS_LI_DQUOT:
+               /* nothing to do in pass 1 */
+               return 0;
+       default:
+               xlog_warn(
+       "XFS: invalid item type (%d) xlog_recover_commit_pass1",
+                       ITEM_TYPE(item));
+               ASSERT(0);
+               return XFS_ERROR(EIO);
+       }
+}
+
+STATIC int
+xlog_recover_commit_pass2(
+       struct log              *log,
+       struct xlog_recover     *trans,
+       xlog_recover_item_t     *item)
+{
+       trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+
+       switch (ITEM_TYPE(item)) {
+       case XFS_LI_BUF:
+               return xlog_recover_buffer_pass2(log, item);
+       case XFS_LI_INODE:
+               return xlog_recover_inode_pass2(log, item);
+       case XFS_LI_EFI:
+               return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+       case XFS_LI_EFD:
+               return xlog_recover_efd_pass2(log, item);
+       case XFS_LI_DQUOT:
+               return xlog_recover_dquot_pass2(log, item);
+       case XFS_LI_QUOTAOFF:
+               /* nothing to do in pass 2 */
+               return 0;
+       default:
+               xlog_warn(
+       "XFS: invalid item type (%d) xlog_recover_commit_pass2",
+                       ITEM_TYPE(item));
+               ASSERT(0);
+               return XFS_ERROR(EIO);
+       }
+}
+
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now.  Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
 STATIC int
 xlog_recover_commit_trans(
-       xlog_t                  *log,
-       xlog_recover_t          *trans,
+       struct log              *log,
+       struct xlog_recover     *trans,
        int                     pass)
 {
-       int                     error;
+       int                     error = 0;
+       xlog_recover_item_t     *item;
 
        hlist_del(&trans->r_list);
-       if ((error = xlog_recover_do_trans(log, trans, pass)))
+
+       error = xlog_recover_reorder_trans(log, trans, pass);
+       if (error)
                return error;
-       xlog_recover_free_trans(trans);                 /* no error */
+
+       list_for_each_entry(item, &trans->r_itemq, ri_list) {
+               if (pass == XLOG_RECOVER_PASS1)
+                       error = xlog_recover_commit_pass1(log, trans, item);
+               else
+                       error = xlog_recover_commit_pass2(log, trans, item);
+               if (error)
+                       return error;
+       }
+
+       xlog_recover_free_trans(trans);
        return 0;
 }
 
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
        xfs_extent_t            *extp;
        xfs_fsblock_t           startblock_fsb;
 
-       ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+       ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
 
        /*
         * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
                                         extp->ext_len);
        }
 
-       efip->efi_flags |= XFS_EFI_RECOVERED;
+       set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
        error = xfs_trans_commit(tp, 0);
        return error;
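
This hunk and the next convert XFS_EFI_RECOVERED from a flag mask tested with
open-coded bit arithmetic into a bit number used with the atomic
set_bit()/test_bit() helpers, so the flag word no longer needs external
locking. A kernel-context sketch of the difference (the demo struct, flag and
helpers below are illustrative, not XFS code):

	#include <linux/bitops.h>
	#include <linux/types.h>

	#define DEMO_RECOVERED	0	/* a bit number, not a mask */

	struct demo {
		unsigned long	flags;
	};

	static void
	mark_recovered(struct demo *d)
	{
		/*
		 * flags |= mask is a read-modify-write: two CPUs updating
		 * different bits in the same word can lose an update.
		 * set_bit() is atomic, so no lock is needed here.
		 */
		set_bit(DEMO_RECOVERED, &d->flags);
	}

	static bool
	is_recovered(struct demo *d)
	{
		return test_bit(DEMO_RECOVERED, &d->flags);
	}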
 
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
                 * Skip EFIs that we've already processed.
                 */
                efip = (xfs_efi_log_item_t *)lip;
-               if (efip->efi_flags & XFS_EFI_RECOVERED) {
+               if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
                        lip = xfs_trans_ail_cursor_next(ailp, &cur);
                        continue;
                }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
        xfs_daddr_t     head_blk,
        xfs_daddr_t     tail_blk)
 {
-       int             error;
+       int             error, i;
 
        ASSERT(head_blk != tail_blk);
 
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
         * First do a pass to find all of the cancelled buf log items.
         * Store them in the buf_cancel_table for use in the second pass.
         */
-       log->l_buf_cancel_table =
-               (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
-                                                sizeof(xfs_buf_cancel_t*),
+       log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+                                                sizeof(struct list_head),
                                                 KM_SLEEP);
+       for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+               INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
                                      XLOG_RECOVER_PASS1);
        if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
                int     i;
 
                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-                       ASSERT(log->l_buf_cancel_table[i] == NULL);
+                       ASSERT(list_empty(&log->l_buf_cancel_table[i]));
        }
 #endif /* DEBUG */
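
The cancel table hunk above swaps a zeroed array of pointers for an array of
embedded list heads. One subtlety worth noting: kmem_zalloc() zeroes the
allocation, but an all-zero list_head is not a valid empty list, which is why
every bucket still gets INIT_LIST_HEAD() before use. A minimal kernel-context
sketch of the same bucket setup plus a lookup walk (the entry type, the
table_alloc()/table_find() helpers and the hash are illustrative stand-ins,
not XFS code):

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	#define TABLE_SIZE	64

	struct cancel_entry {
		struct list_head	list;
		u64			blkno;
	};

	static struct list_head *
	table_alloc(void)
	{
		struct list_head	*table;
		int			i;

		table = kzalloc(TABLE_SIZE * sizeof(struct list_head),
				GFP_KERNEL);
		if (!table)
			return NULL;

		/* zeroed memory is not an empty list; init each bucket */
		for (i = 0; i < TABLE_SIZE; i++)
			INIT_LIST_HEAD(&table[i]);
		return table;
	}

	static struct cancel_entry *
	table_find(struct list_head *table, u64 blkno)
	{
		struct cancel_entry	*ce;

		list_for_each_entry(ce, &table[blkno % TABLE_SIZE], list) {
			if (ce->blkno == blkno)
				return ce;
		}
		return NULL;
	}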
 
index 19e9dfa..d447aef 100644 (file)
@@ -472,7 +472,7 @@ xfs_initialize_perag(
                        goto out_unwind;
                pag->pag_agno = index;
                pag->pag_mount = mp;
-               rwlock_init(&pag->pag_ici_lock);
+               spin_lock_init(&pag->pag_ici_lock);
                mutex_init(&pag->pag_ici_reclaim_lock);
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
                spin_lock_init(&pag->pag_buf_lock);
@@ -974,6 +974,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
        mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
 }
 
+/*
+ * Precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+       struct xfs_mount        *mp)
+{
+       int i;
+
+       for (i = 0; i < XFS_LOWSP_MAX; i++) {
+               __uint64_t space = mp->m_sb.sb_dblocks;
+
+               do_div(space, 100);
+               mp->m_low_space[i] = space * (i + 1);
+       }
+}
+
+
 /*
  * Set whether we're using inode alignment.
  */
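To make the arithmetic in xfs_set_low_space_thresholds() concrete: each
threshold is (i + 1) percent of the filesystem size in blocks, computed via
do_div() because sb_dblocks is 64-bit. A standalone userspace sketch of the
same computation (the 1,000,000-block filesystem size is a made-up example):

	#include <stdio.h>
	#include <stdint.h>

	#define XFS_LOWSP_MAX	5

	int
	main(void)
	{
		/* hypothetical size, standing in for mp->m_sb.sb_dblocks */
		uint64_t	dblocks = 1000000;
		uint64_t	low_space[XFS_LOWSP_MAX];
		int		i;

		for (i = 0; i < XFS_LOWSP_MAX; i++) {
			uint64_t space = dblocks / 100; /* do_div(space, 100) */

			low_space[i] = space * (i + 1);
			printf("threshold %d%%: %llu blocks\n", i + 1,
			       (unsigned long long)low_space[i]);
		}
		return 0;
	}

For this example size the thresholds come out to 10,000 through 50,000
blocks, matching the "5% down to 1%" range noted in xfs_mount.h below.
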
@@ -1196,6 +1214,9 @@ xfs_mountfs(
         */
        xfs_set_rw_sizes(mp);
 
+       /* set the low space thresholds for dynamic preallocation */
+       xfs_set_low_space_thresholds(mp);
+
        /*
         * Set the inode cluster size.
         * This may still be overridden by the file system
index 5861b49..a62e897 100644 (file)
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
        xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif
 
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+       XFS_LOWSP_1_PCNT = 0,
+       XFS_LOWSP_2_PCNT,
+       XFS_LOWSP_3_PCNT,
+       XFS_LOWSP_4_PCNT,
+       XFS_LOWSP_5_PCNT,
+       XFS_LOWSP_MAX,
+};
+
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
        __int64_t               m_update_flags; /* sb flags we need to update
                                                   on the next remount,rw */
        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
+       int64_t                 m_low_space[XFS_LOWSP_MAX];
+                                               /* low free space thresholds */
 } xfs_mount_t;
 
 /*
@@ -379,6 +391,8 @@ extern int  xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern int     xfs_dev_is_read_only(struct xfs_mount *, char *);
 
+extern void    xfs_set_low_space_thresholds(struct xfs_mount *);
+
 #endif /* __KERNEL__ */
 
 extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
index f6d956b..f80a067 100644 (file)
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
  * they could be immediately flushed and we'd have to race with the flusher
  * trying to pull the item from the AIL as we add it.
  */
-void
+static void
 xfs_trans_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               commit_lsn,
@@ -1425,6 +1425,83 @@ xfs_trans_committed(
        xfs_trans_free(tp);
 }
 
+static inline void
+xfs_log_item_batch_insert(
+       struct xfs_ail          *ailp,
+       struct xfs_log_item     **log_items,
+       int                     nr_items,
+       xfs_lsn_t               commit_lsn)
+{
+       int     i;
+
+       spin_lock(&ailp->xa_lock);
+       /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+       xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
+
+       for (i = 0; i < nr_items; i++)
+               IOP_UNPIN(log_items[i], 0);
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ */
+void
+xfs_trans_committed_bulk(
+       struct xfs_ail          *ailp,
+       struct xfs_log_vec      *log_vector,
+       xfs_lsn_t               commit_lsn,
+       int                     aborted)
+{
+#define LOG_ITEM_BATCH_SIZE    32
+       struct xfs_log_item     *log_items[LOG_ITEM_BATCH_SIZE];
+       struct xfs_log_vec      *lv;
+       int                     i = 0;
+
+       /* unpin all the log items */
+       for (lv = log_vector; lv; lv = lv->lv_next) {
+               struct xfs_log_item     *lip = lv->lv_item;
+               xfs_lsn_t               item_lsn;
+
+               if (aborted)
+                       lip->li_flags |= XFS_LI_ABORTED;
+               item_lsn = IOP_COMMITTED(lip, commit_lsn);
+
+               /* item_lsn of -1 means the item was freed */
+               if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+                       continue;
+
+               if (item_lsn != commit_lsn) {
+
+                       /*
+                        * Not a bulk update candidate due to an unusual
+                        * item_lsn.
+                        * Push into AIL immediately, rechecking the lsn once
+                        * we have the ail lock. Then unpin the item.
+                        */
+                       spin_lock(&ailp->xa_lock);
+                       if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+                               xfs_trans_ail_update(ailp, lip, item_lsn);
+                       else
+                               spin_unlock(&ailp->xa_lock);
+                       IOP_UNPIN(lip, 0);
+                       continue;
+               }
+
+               /* Item is a candidate for bulk AIL insert.  */
+               log_items[i++] = lv->lv_item;
+               if (i >= LOG_ITEM_BATCH_SIZE) {
+                       xfs_log_item_batch_insert(ailp, log_items,
+                                       LOG_ITEM_BATCH_SIZE, commit_lsn);
+                       i = 0;
+               }
+       }
+
+       /* make sure we insert the remainder! */
+       if (i)
+               xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
+}
+
 /*
  * Called from the trans_commit code when we notice that
  * the filesystem is in the middle of a forced shutdown.
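
xfs_trans_committed_bulk() above follows a classic batch-and-flush shape:
candidates accumulate in a fixed-size on-stack array, the array is pushed
through the bulk insert (one AIL lock round trip) whenever it fills, and a
final flush catches the remainder. A stripped-down, compilable sketch of the
same control flow (commit_bulk(), flush_batch() and the plain int items are
illustrative stand-ins for the XFS types):

	#include <stdio.h>

	#define BATCH_SIZE	32

	/* stands in for xfs_log_item_batch_insert(): one lock per batch */
	static void
	flush_batch(int *batch, int nr)
	{
		printf("flushing %d items under one lock hold\n", nr);
	}

	static void
	commit_bulk(int *items, int nr_items)
	{
		int	batch[BATCH_SIZE];
		int	i = 0;
		int	n;

		for (n = 0; n < nr_items; n++) {
			batch[i++] = items[n];
			if (i >= BATCH_SIZE) {
				flush_batch(batch, BATCH_SIZE);
				i = 0;
			}
		}

		/* make sure we flush the remainder */
		if (i)
			flush_batch(batch, i);
	}

	int
	main(void)
	{
		int	items[70] = { 0 };

		commit_bulk(items, 70);	/* 32 + 32 + a remainder of 6 */
		return 0;
	}
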
index 246286b..c2042b7 100644 (file)
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
 #define        XFS_ALLOC_BTREE_REF     2
 #define        XFS_BMAP_BTREE_REF      2
 #define        XFS_DIR_BTREE_REF       2
+#define        XFS_INO_REF             2
 #define        XFS_ATTR_BTREE_REF      1
-#define        XFS_INO_REF             1
 #define        XFS_DQUOT_REF           1
 
 #ifdef __KERNEL__
index dc90695..c5bbbc4 100644 (file)
@@ -28,8 +28,8 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 
-STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
+STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
 STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
 STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
 
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
                xfs_log_move_tail(ailp->xa_mount, 1);
 }      /* xfs_trans_unlocked_item */
 
-
 /*
- * Update the position of the item in the AIL with the new
- * lsn.  If it is not yet in the AIL, add it.  Otherwise, move
- * it to its new position by removing it and re-adding it.
+ * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
+ *
+ * xfs_trans_ail_update_bulk() takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added.  Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
  *
- * Wakeup anyone with an lsn less than the item's lsn.  If the item
- * we move in the AIL is the minimum one, update the tail lsn in the
- * log manager.
+ * The AIL lock is taken only once for the entire array of items rather than
+ * once per item, which minimises lock traffic. Because the items are gathered
+ * before the lock is taken, each log item LSN is rechecked under the lock to
+ * confirm it still needs to be moved forward in the AIL.
  *
- * This function must be called with the AIL lock held.  The lock
- * is dropped before returning.
+ * To optimise the insert operation, we delete all the items from the AIL in
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
  */
 void
-xfs_trans_ail_update(
-       struct xfs_ail  *ailp,
-       xfs_log_item_t  *lip,
-       xfs_lsn_t       lsn) __releases(ailp->xa_lock)
+xfs_trans_ail_update_bulk(
+       struct xfs_ail          *ailp,
+       struct xfs_log_item     **log_items,
+       int                     nr_items,
+       xfs_lsn_t               lsn) __releases(ailp->xa_lock)
 {
-       xfs_log_item_t          *dlip = NULL;
-       xfs_log_item_t          *mlip;  /* ptr to minimum lip */
+       xfs_log_item_t          *mlip;
        xfs_lsn_t               tail_lsn;
+       int                     mlip_changed = 0;
+       int                     i;
+       LIST_HEAD(tmp);
 
        mlip = xfs_ail_min(ailp);
 
-       if (lip->li_flags & XFS_LI_IN_AIL) {
-               dlip = xfs_ail_delete(ailp, lip);
-               ASSERT(dlip == lip);
-               xfs_trans_ail_cursor_clear(ailp, dlip);
-       } else {
-               lip->li_flags |= XFS_LI_IN_AIL;
+       for (i = 0; i < nr_items; i++) {
+               struct xfs_log_item *lip = log_items[i];
+               if (lip->li_flags & XFS_LI_IN_AIL) {
+                       /* check if we really need to move the item */
+                       if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
+                               continue;
+
+                       xfs_ail_delete(ailp, lip);
+                       if (mlip == lip)
+                               mlip_changed = 1;
+               } else {
+                       lip->li_flags |= XFS_LI_IN_AIL;
+               }
+               lip->li_lsn = lsn;
+               list_add(&lip->li_ail, &tmp);
        }
 
-       lip->li_lsn = lsn;
-       xfs_ail_insert(ailp, lip);
+       xfs_ail_splice(ailp, &tmp, lsn);
 
-       if (mlip == dlip) {
-               mlip = xfs_ail_min(ailp);
-               /*
-                * It is not safe to access mlip after the AIL lock is
-                * dropped, so we must get a copy of li_lsn before we do
-                * so.  This is especially important on 32-bit platforms
-                * where accessing and updating 64-bit values like li_lsn
-                * is not atomic.
-                */
-               tail_lsn = mlip->li_lsn;
-               spin_unlock(&ailp->xa_lock);
-               xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-       } else {
+       if (!mlip_changed) {
                spin_unlock(&ailp->xa_lock);
+               return;
        }
 
-
-}      /* xfs_trans_update_ail */
+       /*
+        * It is not safe to access mlip after the AIL lock is dropped, so we
+        * must get a copy of li_lsn before we do so.  This is especially
+        * important on 32-bit platforms where accessing and updating 64-bit
+        * values like li_lsn is not atomic.
+        */
+       mlip = xfs_ail_min(ailp);
+       tail_lsn = mlip->li_lsn;
+       spin_unlock(&ailp->xa_lock);
+       xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
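
The delete-then-splice trick above is worth a closer look: items are first
unlinked onto a private list, and the ordered insertion point is then located
once for the whole batch rather than once per item. A simplified
kernel-context sketch of that technique (demo_item and reposition_bulk() are
illustrative and locking is omitted; only the list manipulation mirrors
xfs_trans_ail_update_bulk()/xfs_ail_splice()):

	#include <linux/list.h>
	#include <linux/types.h>

	struct demo_item {
		struct list_head	list;
		u64			lsn;
	};

	static void
	reposition_bulk(struct list_head *ail, struct demo_item **items,
			int nr_items, u64 lsn)
	{
		struct demo_item	*pos;
		LIST_HEAD(tmp);
		int			i;

		for (i = 0; i < nr_items; i++) {
			if (items[i]->lsn >= lsn)
				continue;	/* no forward movement needed */
			list_del(&items[i]->list);
			items[i]->lsn = lsn;
			list_add(&items[i]->list, &tmp);
		}

		if (list_empty(&tmp))
			return;

		/*
		 * Search from the tail: batches are almost always inserted
		 * near the end of an LSN-ordered list. If the loop runs off
		 * the front, &pos->list aliases the list head and the splice
		 * lands at the front, just as in xfs_ail_splice().
		 */
		list_for_each_entry_reverse(pos, ail, list) {
			if (pos->lsn <= lsn)
				break;
		}

		list_splice_init(&tmp, &pos->list);
	}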
 
 /*
- * Delete the given item from the AIL.  It must already be in
- * the AIL.
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
  *
- * Wakeup anyone with an lsn less than item's lsn.    If the item
- * we delete in the AIL is the minimum one, update the tail lsn in the
- * log manager.
+ * xfs_trans_ail_delete_bulk() takes an array of log items that all need to be
+ * removed from the AIL. The caller is already holding the AIL lock, and has
+ * done all the checks necessary to ensure the items passed in via @log_items
+ * are ready for deletion. This includes checking that the items are in the AIL.
  *
- * Clear the IN_AIL flag from the item, reset its lsn to 0, and
- * bump the AIL's generation count to indicate that the tree
- * has changed.
+ * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
+ * flag from the item and reset the item's lsn to 0. If we remove the first
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
  *
- * This function must be called with the AIL lock held.  The lock
- * is dropped before returning.
+ * This function will not drop the AIL lock until all items are removed from
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held.  The lock is dropped
+ * before returning.
  */
 void
-xfs_trans_ail_delete(
-       struct xfs_ail  *ailp,
-       xfs_log_item_t  *lip) __releases(ailp->xa_lock)
+xfs_trans_ail_delete_bulk(
+       struct xfs_ail          *ailp,
+       struct xfs_log_item     **log_items,
+       int                     nr_items) __releases(ailp->xa_lock)
 {
-       xfs_log_item_t          *dlip;
        xfs_log_item_t          *mlip;
        xfs_lsn_t               tail_lsn;
+       int                     mlip_changed = 0;
+       int                     i;
 
-       if (lip->li_flags & XFS_LI_IN_AIL) {
-               mlip = xfs_ail_min(ailp);
-               dlip = xfs_ail_delete(ailp, lip);
-               ASSERT(dlip == lip);
-               xfs_trans_ail_cursor_clear(ailp, dlip);
-
+       mlip = xfs_ail_min(ailp);
 
-               lip->li_flags &= ~XFS_LI_IN_AIL;
-               lip->li_lsn = 0;
+       for (i = 0; i < nr_items; i++) {
+               struct xfs_log_item *lip = log_items[i];
+               if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+                       struct xfs_mount        *mp = ailp->xa_mount;
 
-               if (mlip == dlip) {
-                       mlip = xfs_ail_min(ailp);
-                       /*
-                        * It is not safe to access mlip after the AIL lock
-                        * is dropped, so we must get a copy of li_lsn
-                        * before we do so.  This is especially important
-                        * on 32-bit platforms where accessing and updating
-                        * 64-bit values like li_lsn is not atomic.
-                        */
-                       tail_lsn = mlip ? mlip->li_lsn : 0;
-                       spin_unlock(&ailp->xa_lock);
-                       xfs_log_move_tail(ailp->xa_mount, tail_lsn);
-               } else {
                        spin_unlock(&ailp->xa_lock);
+                       if (!XFS_FORCED_SHUTDOWN(mp)) {
+                               xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+               "%s: attempting to delete a log item that is not in the AIL",
+                                               __func__);
+                               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+                       }
+                       return;
                }
+
+               xfs_ail_delete(ailp, lip);
+               lip->li_flags &= ~XFS_LI_IN_AIL;
+               lip->li_lsn = 0;
+               if (mlip == lip)
+                       mlip_changed = 1;
        }
-       else {
-               /*
-                * If the file system is not being shutdown, we are in
-                * serious trouble if we get to this stage.
-                */
-               struct xfs_mount        *mp = ailp->xa_mount;
 
+       if (!mlip_changed) {
                spin_unlock(&ailp->xa_lock);
-               if (!XFS_FORCED_SHUTDOWN(mp)) {
-                       xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-               "%s: attempting to delete a log item that is not in the AIL",
-                                       __func__);
-                       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-               }
+               return;
        }
-}
-
 
+       /*
+        * It is not safe to access mlip after the AIL lock is dropped, so we
+        * must get a copy of li_lsn before we do so.  This is especially
+        * important on 32-bit platforms where accessing and updating 64-bit
+        * values like li_lsn is not atomic. It is possible we've emptied the
+        * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
+        */
+       mlip = xfs_ail_min(ailp);
+       tail_lsn = mlip ? mlip->li_lsn : 0;
+       spin_unlock(&ailp->xa_lock);
+       xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
 
 /*
  * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
 }
 
 /*
- * Insert the given log item into the AIL.
- * We almost always insert at the end of the list, so on inserts
- * we search from the end of the list to find where the
- * new item belongs.
+ * Splice the log item list into the AIL at the given LSN.
  */
 STATIC void
-xfs_ail_insert(
+xfs_ail_splice(
        struct xfs_ail  *ailp,
-       xfs_log_item_t  *lip)
-/* ARGSUSED */
+       struct list_head *list,
+       xfs_lsn_t       lsn)
 {
        xfs_log_item_t  *next_lip;
 
@@ -640,39 +660,33 @@ xfs_ail_insert(
         * If the list is empty, just insert the item.
         */
        if (list_empty(&ailp->xa_ail)) {
-               list_add(&lip->li_ail, &ailp->xa_ail);
+               list_splice(list, &ailp->xa_ail);
                return;
        }
 
        list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
-               if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+               if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
                        break;
        }
 
        ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
-              (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
-
-       list_add(&lip->li_ail, &next_lip->li_ail);
+              (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
 
-       xfs_ail_check(ailp, lip);
+       list_splice_init(list, &next_lip->li_ail);
        return;
 }
 
 /*
 * Delete the given item from the AIL, clearing any cursor that points to it.
  */
-/*ARGSUSED*/
-STATIC xfs_log_item_t *
+STATIC void
 xfs_ail_delete(
        struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
-/* ARGSUSED */
 {
        xfs_ail_check(ailp, lip);
-
        list_del(&lip->li_ail);
-
-       return lip;
+       xfs_trans_ail_cursor_clear(ailp, lip);
 }
 
 /*
@@ -682,7 +696,6 @@ xfs_ail_delete(
 STATIC xfs_log_item_t *
 xfs_ail_min(
        struct xfs_ail  *ailp)
-/* ARGSUSED */
 {
        if (list_empty(&ailp->xa_ail))
                return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
 xfs_ail_next(
        struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
-/* ARGSUSED */
 {
        if (lip->li_ail.next == &ailp->xa_ail)
                return NULL;
index f783d5e..f7590f5 100644 (file)
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t                *tp,
        tp->t_flags |= XFS_TRANS_DIRTY;
        efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
-       next_extent = efip->efi_next_extent;
+       /*
+        * atomic_inc_return gives us the value after the increment;
+        * we want to use it as an array index so we need to subtract 1 from
+        * it.
+        */
+       next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
        ASSERT(next_extent < efip->efi_format.efi_nextents);
        extp = &(efip->efi_format.efi_extents[next_extent]);
        extp->ext_start = start_block;
        extp->ext_len = ext_len;
-       efip->efi_next_extent++;
 }
 
 
index 62da86c..35162c2 100644 (file)
@@ -22,15 +22,17 @@ struct xfs_log_item;
 struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
 
 void   xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void   xfs_trans_del_item(struct xfs_log_item *);
 void   xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
                                int flags);
-void   xfs_trans_item_committed(struct xfs_log_item *lip,
-                               xfs_lsn_t commit_lsn, int aborted);
 void   xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
+void   xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+                               xfs_lsn_t commit_lsn, int aborted);
 /*
  * AIL traversal cursor.
  *
@@ -73,12 +75,29 @@ struct xfs_ail {
 /*
  * From xfs_trans_ail.c
  */
-void                   xfs_trans_ail_update(struct xfs_ail *ailp,
-                                       struct xfs_log_item *lip, xfs_lsn_t lsn)
-                                       __releases(ailp->xa_lock);
-void                   xfs_trans_ail_delete(struct xfs_ail *ailp,
-                                       struct xfs_log_item *lip)
-                                       __releases(ailp->xa_lock);
+void   xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+                               struct xfs_log_item **log_items, int nr_items,
+                               xfs_lsn_t lsn) __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_update(
+       struct xfs_ail          *ailp,
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn) __releases(ailp->xa_lock)
+{
+       xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
+}
+
+void   xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+                               struct xfs_log_item **log_items, int nr_items)
+                               __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+       struct xfs_ail  *ailp,
+       xfs_log_item_t  *lip) __releases(ailp->xa_lock)
+{
+       xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+}
+
 void                   xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
 void                   xfs_trans_unlocked_item(struct xfs_ail *,
                                        xfs_log_item_t *);
index 8e4a63c..d8e6f8c 100644 (file)
@@ -964,29 +964,48 @@ xfs_release(
                        xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
        }
 
-       if (ip->i_d.di_nlink != 0) {
-               if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                    ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                      ip->i_delayed_blks > 0)) &&
-                    (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-                   (!(ip->i_d.di_flags &
-                               (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+       if (ip->i_d.di_nlink == 0)
+               return 0;
 
-                       /*
-                        * If we can't get the iolock just skip truncating
-                        * the blocks past EOF because we could deadlock
-                        * with the mmap_sem otherwise.  We'll get another
-                        * chance to drop them once the last reference to
-                        * the inode is dropped, so we'll never leak blocks
-                        * permanently.
-                        */
-                       error = xfs_free_eofblocks(mp, ip,
-                                                  XFS_FREE_EOF_TRYLOCK);
-                       if (error)
-                               return error;
-               }
-       }
+       if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+            ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+              ip->i_delayed_blks > 0)) &&
+            (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+           (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
+               /*
+                * If we can't get the iolock just skip truncating the blocks
+                * past EOF because we could deadlock with the mmap_sem
+                * otherwise.  We'll get another chance to drop them once the
+                * last reference to the inode is dropped, so we'll never leak
+                * blocks permanently.
+                *
+                * Further, if the inode is being opened, written and closed
+                * frequently and we have delayed allocation blocks
+                * outstanding (e.g. streaming writes from the NFS server),
+                * truncating the blocks past EOF will cause fragmentation.
+                *
+                * In this case don't do the truncation, either, but we have to
+                * be careful how we detect this case. Blocks beyond EOF show
+                * up as i_delayed_blks even when the inode is clean, so we
+                * need to truncate them away first before checking for a dirty
+                * release. Hence on the first dirty close we will still remove
+                * the speculative allocation, but after that we will leave it
+                * in place.
+                */
+               if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+                       return 0;
+
+               error = xfs_free_eofblocks(mp, ip,
+                                          XFS_FREE_EOF_TRYLOCK);
+               if (error)
+                       return error;
+
+               /* delalloc blocks left after truncation mean it really is dirty */
+               if (ip->i_delayed_blks)
+                       xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+       }
        return 0;
 }