From: Christoph Hellwig Date: Fri, 24 Jun 2011 18:29:43 +0000 (-0400) Subject: fs: kill i_alloc_sem X-Git-Tag: v3.1-rc1~282^2~32 X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3;p=pandora-kernel.git fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and it's write side is always mirrored by real exclusion. It's intended use it to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall way - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches non-zero we wake up a pending truncate using wake_up_bit on a new bit in i_flags - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- diff --git a/fs/attr.c b/fs/attr.c index caf2aa521e2b..f177ac86fa48 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return error; if (ia_valid & ATTR_SIZE) - down_write(&dentry->d_inode->i_alloc_sem); + inode_dio_wait(inode); if (inode->i_op->setattr) error = inode->i_op->setattr(dentry, attr); else error = simple_setattr(dentry, attr); - if (ia_valid & ATTR_SIZE) - up_write(&dentry->d_inode->i_alloc_sem); - if (!error) fsnotify_change(dentry, ia_valid); diff --git a/fs/direct-io.c b/fs/direct-io.c index 98ce3ac0d94b..354cbdbc14bd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -135,6 +135,50 @@ struct dio { struct page *pages[DIO_PAGES]; /* page buffer */ }; +static void __inode_dio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + do { + prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&inode->i_dio_count)) + schedule(); + } while (atomic_read(&inode->i_dio_count)); + finish_wait(wq, &q.wait); +} + +/** + * inode_dio_wait - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +void inode_dio_wait(struct inode *inode) +{ + if (atomic_read(&inode->i_dio_count)) + __inode_dio_wait(inode); +} +EXPORT_SYMBOL_GPL(inode_dio_wait); + +/* + * inode_dio_done - signal finish of a direct I/O requests + * @inode: inode the direct I/O happens on + * + * This is called once we've finished processing a direct I/O request, + * and is used to wake up callers waiting for direct I/O to be quiesced. + */ +void inode_dio_done(struct inode *inode) +{ + if (atomic_dec_and_test(&inode->i_dio_count)) + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); +} +EXPORT_SYMBOL_GPL(inode_dio_done); + /* * How many pages are in the queue? */ @@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is } if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - + inode_dio_done(dio->inode); return ret; } @@ -980,9 +1022,6 @@ out: return ret; } -/* - * Releases both i_mutex and i_alloc_sem - */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, @@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * For writes this function is called under i_mutex and returns with * i_mutex held, for reads, i_mutex is not held on entry, but it is * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). + * The i_dio_count counter keeps track of the number of outstanding + * direct I/O requests, and truncate waits for it to reach zero. + * New references to i_dio_count must only be grabbed with i_mutex + * held. * * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * Will be released at I/O completion, possibly in a - * different thread. + * Will be decremented at I/O completion time. */ - down_read_non_owner(&inode->i_alloc_sem); + atomic_inc(&inode->i_dio_count); } /* diff --git a/fs/inode.c b/fs/inode.c index cf81baf1898a..96c77b81167c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mutex_init(&inode->i_mutex); lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); - init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f4b1057abdd2..b59f5ac26bef 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, * fails again. */ if (unlikely(NInoTruncateFailed(ni))) { - down_write(&vi->i_alloc_sem); + inode_dio_wait(vi); err = ntfs_truncate(vi); - up_write(&vi->i_alloc_sem); if (err || NInoTruncateFailed(ni)) { if (!err) err = -EIO; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index c05d6dcf77a4..1371487da955 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run " * * Returns 0 on success or -errno on error. * - * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for - * writing. The only case in the kernel where ->i_alloc_sem is not held is - * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called - * with the current i_size as the offset. The analogous place in NTFS is in - * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again - * without holding ->i_alloc_sem. + * Called with ->i_mutex held. */ int ntfs_truncate(struct inode *vi) { @@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) { * We also abort all changes of user, group, and mode as we do not implement * the NTFS ACLs yet. * - * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also - * called with ->i_alloc_sem held for writing. + * Called with ->i_mutex held. */ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ac97bca282d2..de1d3953599d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -551,9 +551,8 @@ bail: /* * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. Like the core uses - * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from - * truncation on another. + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. */ static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, @@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); if (ocfs2_iocb_is_sem_locked(iocb)) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1406c37a5722..2c3a465514a2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, ocfs2_iocb_clear_sem_locked(iocb); relock: - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { - down_read(&inode->i_alloc_sem); + atomic_inc(&inode->i_dio_count); have_alloc_sem = 1; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_sem_locked(iocb); @@ -2290,7 +2290,7 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); have_alloc_sem = 0; rw_level = -1; @@ -2361,8 +2361,7 @@ out_dio: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. (it's the clustered equivalent of - * i_alloc_sem; protects truncate from racing with pending ios). + * it can unlock our rw lock. * Unfortunately there are error cases which call end_io and others * that don't. so we don't have to unlock the rw_lock if either an * async dio is going to do it in the future or an end_io after an @@ -2379,7 +2378,7 @@ out: out_sems: if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } @@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * need locks to protect pending reads from racing with truncate. */ if (filp->f_flags & O_DIRECT) { - down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + atomic_inc(&inode->i_dio_count); ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); @@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, bail: if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } if (rw_level != -1) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4ea2ab41fdee..6938d8c68d6e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - down_write(&dentry->d_inode->i_alloc_sem); + inode_dio_wait(dentry->d_inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_setattr(dentry, &newattrs); - up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); } else update_ctime(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1393742bba9b..2fe920774abf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -779,7 +779,7 @@ struct inode { struct timespec i_ctime; blkcnt_t i_blocks; unsigned short i_bytes; - struct rw_semaphore i_alloc_sem; + atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space *i_mapping; @@ -1705,6 +1705,10 @@ struct super_operations { * set during data writeback, and cleared with a wakeup * on the bit address once it is done. * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1718,6 +1722,8 @@ struct super_operations { #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) +#define __I_DIO_WAKEUP 9 +#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1828,7 +1834,6 @@ struct file_system_type { struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; - struct lock_class_key i_alloc_sem_key; }; extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, @@ -2404,6 +2409,8 @@ enum { }; void dio_end_io(struct bio *bio, int error); +void inode_dio_wait(struct inode *inode); +void inode_dio_done(struct inode *inode); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8d3457..f820e600f1ad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -78,9 +78,6 @@ * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_mutex - * ->i_alloc_sem (various) - * * inode_wb_list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed503..74bf193eff04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + /* vmtruncate_range needs to take i_mutex */ up_read(¤t->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); down_read(¤t->mm->mmap_sem); diff --git a/mm/rmap.c b/mm/rmap.c index 23295f65ae43..2540a39eea4a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -21,7 +21,6 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex diff --git a/mm/truncate.c b/mm/truncate.c index e13f22efaad7..003c6c685fc8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return -ENOSYS; mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); + inode_dio_wait(inode); unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); /* unmap again to remove racily COWed private pages */ unmap_mapping_range(mapping, offset, (end - offset), 1); - up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0;