xfs: convert inode cache lookups to use RCU locking

author Dave Chinner <dchinner@redhat.com>

Fri, 17 Dec 2010 06:29:43 +0000 (17:29 +1100)

committer Dave Chinner <david@fromorbit.com>

Fri, 17 Dec 2010 06:29:43 +0000 (17:29 +1100)
author Dave Chinner <dchinner@redhat.com>
Fri, 17 Dec 2010 06:29:43 +0000 (17:29 +1100)
committer Dave Chinner <david@fromorbit.com>
Fri, 17 Dec 2010 06:29:43 +0000 (17:29 +1100)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c

index afb0d7c..fd38682 100644 (file)
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
  {
         struct inode            *inode = VFS_I(ip);
  
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and is still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
         /* nothing to sync during shutdown */
         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 return EFSCORRUPTED;
  
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               return ENOENT;
-
         /* If we can't grab the inode, it must on it's way to reclaim. */
         if (!igrab(inode))
                 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
  
         /* inode is valid */
         return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
  }
  
  STATIC int
@@ -98,12 +118,12 @@ restart:
                 int             error = 0;
                 int             i;
  
-               read_lock(&pag->pag_ici_lock);
+               rcu_read_lock();
                 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                         (void **)batch, first_index,
                                         XFS_LOOKUP_BATCH);
                 if (!nr_found) {
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
                         break;
                 }
  
@@ -118,18 +138,26 @@ restart:
                                 batch[i] = NULL;
  
                         /*
-                        * Update the index for the next lookup. Catch overflows
-                        * into the next AG range which can occur if we have inodes
-                        * in the last block of the AG and we are currently
-                        * pointing to the last inode.
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that led
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
                          */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
                         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                         if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                 done = 1;
                 }
  
                 /* unlock now we've grabbed the inodes. */
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
  
                 for (i = 0; i < nr_found; i++) {
                         if (!batch[i])
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
         struct xfs_inode        *ip,
         int                     flags)
  {
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
  
         /*
-        * do some unlocked checks first to avoid unnecceary lock traffic.
+        * do some unlocked checks first to avoid unnecessary lock traffic.
          * The first is a flush lock check, the second is a already in reclaim
          * check. Only do these checks if we are not going to block on locks.
          */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
          * The radix tree lock here protects a thread in xfs_iget from racing
          * with us starting reclaim on the inode.  Once we have the
          * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check that
+        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
          */
         spin_lock(&ip->i_flags_lock);
-       ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* ignore as it is already under reclaim */
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
                 spin_unlock(&ip->i_flags_lock);
                 return 1;
         }
@@ -864,14 +902,14 @@ restart:
                         struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                         int     i;
  
-                       write_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                         nr_found = radix_tree_gang_lookup_tag(
                                         &pag->pag_ici_root,
                                         (void **)batch, first_index,
                                         XFS_LOOKUP_BATCH,
                                         XFS_ICI_RECLAIM_TAG);
                         if (!nr_found) {
-                               write_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                 break;
                         }
  
@@ -891,14 +929,24 @@ restart:
                                  * occur if we have inodes in the last block of
                                  * the AG and we are currently pointing to the
                                  * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that led us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
                                  */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
                                 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                         done = 1;
                         }
  
                         /* unlock now we've grabbed the inodes. */
-                       write_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
  
                         for (i = 0; i < nr_found; i++) {
                                 if (!batch[i])
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c

index 9fae475..04ed09b 100644 (file)
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
         ASSERT(atomic_read(&ip->i_pincount) == 0);
         ASSERT(!spin_is_locked(&ip->i_flags_lock));
         ASSERT(completion_done(&ip->i_flush));
+       ASSERT(ip->i_ino == 0);
  
         mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
         lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
         ip->i_size = 0;
         ip->i_new_size = 0;
  
-       /* prevent anyone from using this yet */
-       VFS_I(ip)->i_state = I_NEW;
-
         return ip;
  }
  
@@ -159,6 +157,16 @@ xfs_inode_free(
         ASSERT(!spin_is_locked(&ip->i_flags_lock));
         ASSERT(completion_done(&ip->i_flush));
  
+       /*
+        * Because we use RCU freeing we need to ensure the inode always
+        * appears to be reclaimed with an invalid inode number when in the
+        * free state. The ip->i_flags_lock provides the barrier against lookup
+        * races.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
         call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
  }
  
@@ -169,14 +177,29 @@ static int
  xfs_iget_cache_hit(
         struct xfs_perag        *pag,
         struct xfs_inode        *ip,
+       xfs_ino_t               ino,
         int                     flags,
-       int                     lock_flags) __releases(pag->pag_ici_lock)
+       int                     lock_flags) __releases(RCU)
  {
         struct inode            *inode = VFS_I(ip);
         struct xfs_mount        *mp = ip->i_mount;
         int                     error;
  
+       /*
+        * check for re-use of an inode within an RCU grace period due to the
+        * radix tree nodes not being updated yet. We monitor for this by
+        * setting the inode number to zero before freeing the inode structure.
+        * If the inode has been reallocated and set up, then the inode number
+        * will not match, so check for that, too.
+        */
         spin_lock(&ip->i_flags_lock);
+       if (ip->i_ino != ino) {
+               trace_xfs_iget_skip(ip);
+               XFS_STATS_INC(xs_ig_frecycle);
+               error = EAGAIN;
+               goto out_error;
+       }
+
  
         /*
          * If we are racing with another cache hit that is currently
@@ -219,7 +242,7 @@ xfs_iget_cache_hit(
                 ip->i_flags |= XFS_IRECLAIM;
  
                 spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
  
                 error = -inode_init_always(mp->m_super, inode);
                 if (error) {
@@ -227,7 +250,7 @@ xfs_iget_cache_hit(
                          * Re-initializing the inode failed, and we are in deep
                          * trouble.  Try to re-add it to the reclaim list.
                          */
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                         spin_lock(&ip->i_flags_lock);
  
                         ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
  
                 /* We've got a live one. */
                 spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                 trace_xfs_iget_hit(ip);
         }
  
@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
  
  out_error:
         spin_unlock(&ip->i_flags_lock);
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
         return error;
  }
  
@@ -397,7 +420,7 @@ xfs_iget(
         xfs_agino_t     agino;
  
         /* reject inode numbers outside existing AGs */
-       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+       if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                 return EINVAL;
  
         /* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(
  
  again:
         error = 0;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
  
         if (ip) {
-               error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+               error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
                 if (error)
                         goto out_error_or_again;
         } else {
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                 XFS_STATS_INC(xs_ig_missed);
  
                 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 108c7a0..43ffd90 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2000,16 +2000,32 @@ xfs_ifree_cluster(
                  */
                 for (i = 0; i < ninodes; i++) {
  retry:
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                         ip = radix_tree_lookup(&pag->pag_ici_root,
                                         XFS_INO_TO_AGINO(mp, (inum + i)));
  
-                       /* Inode not in memory or stale, nothing to do */
-                       if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-                               read_unlock(&pag->pag_ici_lock);
+                       /* Inode not in memory, nothing to do */
+                       if (!ip) {
+                               rcu_read_unlock();
                                 continue;
                         }
  
+                       /*
+                        * because this is an RCU protected lookup, we could
+                        * find a recently freed or even reallocated inode
+                        * during the lookup. We need to check under the
+                        * i_flags_lock for a valid inode here. Skip it if it
+                        * is not valid, the wrong inode or stale.
+                        */
+                       spin_lock(&ip->i_flags_lock);
+                       if (ip->i_ino != inum + i ||
+                           __xfs_iflags_test(ip, XFS_ISTALE)) {
+                               spin_unlock(&ip->i_flags_lock);
+                               rcu_read_unlock();
+                               continue;
+                       }
+                       spin_unlock(&ip->i_flags_lock);
+
                         /*
                          * Don't try to lock/unlock the current inode, but we
                          * _cannot_ skip the other inodes that we did not find
@@ -2019,11 +2035,11 @@ retry:
                          */
                         if (ip != free_ip &&
                             !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                               read_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                 delay(1);
                                 goto retry;
                         }
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
  
                         xfs_iflock(ip);
                         xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
  
         mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
         /* really need a gang lookup range call here */
         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                         first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
                 iq = ilist[i];
                 if (iq == ip)
                         continue;
-               /* if the inode lies outside this cluster, we're done. */
-               if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-                       break;
+
+               /*
+                * because this is an RCU protected lookup, we could find a
+                * recently freed or even reallocated inode during the lookup.
+                * We need to check iq under its own i_flags_lock for a valid
+                * inode here. Skip it if it is not valid or the wrong inode.
+                */
+               spin_lock(&iq->i_flags_lock);
+               if (!iq->i_ino ||
+                   (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                       spin_unlock(&iq->i_flags_lock);
+                       continue;
+               }
+               spin_unlock(&iq->i_flags_lock);
+
                 /*
                  * Do an un-protected check to see if the inode is dirty and
                  * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
         }
  
  out_free:
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
         kmem_free(ilist);
  out_put:
         xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
          * Corruption detected in the clustering loop.  Invalidate the
          * inode buffer and shut down the filesystem.
          */
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
         /*
          * Clean up the buffer.  If it was B_DELWRI, just release it --
          * brelse can handle it with no problems.  If not, shut down the
author	Dave Chinner <dchinner@redhat.com>
	Fri, 17 Dec 2010 06:29:43 +0000 (17:29 +1100)
committer	Dave Chinner <david@fromorbit.com>
	Fri, 17 Dec 2010 06:29:43 +0000 (17:29 +1100)
fs/xfs/linux-2.6/xfs_sync.c		patch \| blob \| history
fs/xfs/xfs_iget.c		patch \| blob \| history
fs/xfs/xfs_inode.c		patch \| blob \| history