[PATCH] Fix a race condition between ->i_mapping and iput()
authorOGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Thu, 22 Jun 2006 21:47:21 +0000 (14:47 -0700)
committerLinus Torvalds <torvalds@g5.osdl.org>
Thu, 22 Jun 2006 22:05:57 +0000 (15:05 -0700)
This race became a cause of oops, and can reproduce by the following.

    while true; do
dd if=/dev/zero of=/dev/.static/dev/hdg1 bs=512 count=1000 & sync
    done

This race condition was between __sync_single_inode() and iput().

          cpu0 (fs's inode)                 cpu1 (bdev's inode)
          -----------------                 -------------------
                                       close("/dev/hda2")
                                       [...]
__sync_single_inode()
   /* copy the bdev's ->i_mapping */
   mapping = inode->i_mapping;

                                       generic_forget_inode()
                                          bdev_clear_inode()
     /* restre the fs's ->i_mapping */
             inode->i_mapping = &inode->i_data;
          /* bdev's inode was freed */
                                          destroy_inode(inode);

   if (wait) {
      /* dereference a freed bdev's mapping->host */
      filemap_fdatawait(mapping);  /* Oops */

Since __sync_single_inode() is only taking a ref-count of fs's inode, the
another process can be close() and freeing the bdev's inode while writing
fs's inode.  So, __sync_signle_inode() accesses the freed ->i_mapping,
oops.

This patch takes a ref-count on the bdev's inode for the fs's inode before
setting a ->i_mapping, and the clear_inode() of the fs's inode does iput() on
the bdev's inode.  So if the fs's inode is still living, bdev's inode
shouldn't be freed.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
fs/block_dev.c

index f5958f4..44aaba2 100644 (file)
@@ -414,21 +414,31 @@ EXPORT_SYMBOL(bdput);
 static struct block_device *bd_acquire(struct inode *inode)
 {
        struct block_device *bdev;
+
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
-       if (bdev && igrab(bdev->bd_inode)) {
+       if (bdev) {
+               atomic_inc(&bdev->bd_inode->i_count);
                spin_unlock(&bdev_lock);
                return bdev;
        }
        spin_unlock(&bdev_lock);
+
        bdev = bdget(inode->i_rdev);
        if (bdev) {
                spin_lock(&bdev_lock);
-               if (inode->i_bdev)
-                       __bd_forget(inode);
-               inode->i_bdev = bdev;
-               inode->i_mapping = bdev->bd_inode->i_mapping;
-               list_add(&inode->i_devices, &bdev->bd_inodes);
+               if (!inode->i_bdev) {
+                       /*
+                        * We take an additional bd_inode->i_count for inode,
+                        * and it's released in clear_inode() of inode.
+                        * So, we can access it via ->i_mapping always
+                        * without igrab().
+                        */
+                       atomic_inc(&bdev->bd_inode->i_count);
+                       inode->i_bdev = bdev;
+                       inode->i_mapping = bdev->bd_inode->i_mapping;
+                       list_add(&inode->i_devices, &bdev->bd_inodes);
+               }
                spin_unlock(&bdev_lock);
        }
        return bdev;
@@ -438,10 +448,18 @@ static struct block_device *bd_acquire(struct inode *inode)
 
 void bd_forget(struct inode *inode)
 {
+       struct block_device *bdev = NULL;
+
        spin_lock(&bdev_lock);
-       if (inode->i_bdev)
+       if (inode->i_bdev) {
+               if (inode->i_sb != blockdev_superblock)
+                       bdev = inode->i_bdev;
                __bd_forget(inode);
+       }
        spin_unlock(&bdev_lock);
+
+       if (bdev)
+               iput(bdev->bd_inode);
 }
 
 int bd_claim(struct block_device *bdev, void *holder)