Merge branch 'globalheartbeat-2' of git://oss.oracle.com/git/smushran/linux-2.6 into...
authorJoel Becker <joel.becker@oracle.com>
Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
committerJoel Becker <joel.becker@oracle.com>
Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
Conflicts:
fs/ocfs2/ocfs2.h

29 files changed:
Documentation/filesystems/ocfs2.txt
fs/ext3/super.c
fs/ext4/super.c
fs/jbd2/journal.c
fs/libfs.c
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/dcache.c
fs/ocfs2/dcache.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlmglue.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/ioctl.c
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/mmap.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_ioctl.h
fs/ocfs2/refcounttree.c
fs/ocfs2/refcounttree.h
fs/ocfs2/slot_map.c
fs/ocfs2/suballoc.c
fs/ocfs2/super.c
fs/ocfs2/sysfile.c
include/linux/fs.h

index 1f7ae14..5393e66 100644 (file)
@@ -87,3 +87,10 @@ dir_resv_level=      (*)     By default, directory reservations will scale with file
                        reservations - users should rarely need to change this
                        value. If allocation reservations are turned off, this
                        option will have no effect.
+coherency=full  (*)    Disallow concurrent O_DIRECT writes, cluster inode
+                       lock will be taken to force other nodes drop cache,
+                       therefore full cluster coherency is guaranteed even
+                       for O_DIRECT writes.
+coherency=buffered     Allow concurrent O_DIRECT writes without EX lock among
+                       nodes, which gains high performance at risk of getting
+                       stale data on other nodes.
index 5dbf4db..a367dd0 100644 (file)
@@ -1849,8 +1849,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
 
-       if (le32_to_cpu(es->s_blocks_count) >
-                   (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+       if (generic_check_addressable(sb->s_blocksize_bits,
+                                     le32_to_cpu(es->s_blocks_count))) {
                ext3_msg(sb, KERN_ERR,
                        "error: filesystem is too large to mount safely");
                if (sizeof(sector_t) < 8)
index 2614774..7f47c36 100644 (file)
@@ -2831,15 +2831,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
-       if ((ext4_blocks_count(es) >
-            (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
-           (ext4_blocks_count(es) >
-            (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+       ret = generic_check_addressable(sb->s_blocksize_bits,
+                                       ext4_blocks_count(es));
+       if (ret) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                if (sizeof(sector_t) < 8)
                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
-               ret = -EFBIG;
                goto failed_mount;
        }
 
index 0e8014e..262419f 100644 (file)
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
 
        if (!compat && !ro && !incompat)
                return 1;
+       /* Load journal superblock if it is not loaded yet. */
+       if (journal->j_format_version == 0 &&
+           journal_get_superblock(journal) != 0)
+               return 0;
        if (journal->j_format_version == 1)
                return 0;
 
index 0a9da95..62baa03 100644 (file)
@@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync)
 }
 EXPORT_SYMBOL(generic_file_fsync);
 
+/**
+ * generic_check_addressable - Check addressability of file system
+ * @blocksize_bits:    log of file system block size
+ * @num_blocks:                number of blocks in file system
+ *
+ * Determine whether a file system with @num_blocks blocks (and a
+ * block size of 2**@blocksize_bits) is addressable by the sector_t
+ * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
+ */
+int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
+{
+       u64 last_fs_block = num_blocks - 1;
+       u64 last_fs_page =
+               last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+
+       if (unlikely(num_blocks == 0))
+               return 0;
+
+       if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+               return -EINVAL;
+
+       if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+           (last_fs_page > (pgoff_t)(~0ULL))) {
+               return -EFBIG;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(generic_check_addressable);
+
 /*
  * No-op implementation of ->fsync for in-memory filesystems.
  */
index 0de69c9..5cfeee1 100644 (file)
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
         * out in so that future reads from that region will get
         * zero's.
         */
-       struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
        unsigned int                    w_num_pages;
+       struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
        struct page                     *w_target_page;
 
        /*
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
        return ret;
 }
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+                            struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata,
                             struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                mlog_errno(ret);
                goto out;
        } else if (ret == 1) {
-               ret = ocfs2_refcount_cow(inode, di_bh,
+               ret = ocfs2_refcount_cow(inode, filp, di_bh,
                                         wc->w_cpos, wc->w_clen, UINT_MAX);
                if (ret) {
                        mlog_errno(ret);
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-       ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
                                       fsdata, di_bh, NULL);
        if (ret) {
                mlog_errno(ret);
index c48e93f..7606f66 100644 (file)
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                           loff_t pos, unsigned len, unsigned copied,
                           struct page *page, void *fsdata);
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+                            struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata,
                             struct buffer_head *di_bh, struct page *mmap_page);
index b4957c7..edaded4 100644 (file)
 #include "inode.h"
 #include "super.h"
 
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+       unsigned long gen =
+               OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+       BUG_ON(dentry->d_inode);
+       dentry->d_fsdata = (void *)gen;
+}
+
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
                                   struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);
 
-       /* Never trust a negative dentry - force a new lookup. */
+       /* For a negative dentry -
+        * check the generation number of the parent and compare with the
+        * one stored in the inode.
+        */
        if (inode == NULL) {
-               mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
-                    dentry->d_name.name);
-               goto bail;
+               unsigned long gen = (unsigned long) dentry->d_fsdata;
+               unsigned long pgen =
+                       OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+               mlog(0, "negative dentry: %.*s parent gen: %lu "
+                       "dentry gen: %lu\n",
+                       dentry->d_name.len, dentry->d_name.name, pgen, gen);
+               if (gen != pgen)
+                       goto bail;
+               goto valid;
        }
 
        BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
                goto bail;
        }
 
+valid:
        ret = 1;
 
 bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
        if (!inode)
                return 0;
 
+       if (!dentry->d_inode && dentry->d_fsdata) {
+               /* Converting a negative dentry to positive
+                  Clear dentry->d_fsdata */
+               dentry->d_fsdata = dl = NULL;
+       }
+
        if (dl) {
                mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
                                " \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 out:
        iput(inode);
+       ocfs2_dentry_attach_gen(dentry);
 }
 
 /*
index f5dd178..b79eff7 100644 (file)
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
                       struct inode *old_dir, struct inode *new_dir);
 
 extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
 
 #endif /* OCFS2_DCACHE_H */
index f693ab8..272ec86 100644 (file)
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        struct hlist_head *bucket;
        struct hlist_node *list;
        int i, out = 0;
-       unsigned long total = 0, longest = 0, bktcnt;
+       unsigned long total = 0, longest = 0, bucket_count = 0;
 
        out += snprintf(db->buf + out, db->len - out,
                        "Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                        mle = hlist_entry(list, struct dlm_master_list_entry,
                                          master_hash_node);
                        ++total;
-                       ++bktcnt;
+                       ++bucket_count;
                        if (db->len - out < 200)
                                continue;
                        out += dump_mle(mle, db->buf + out, db->len - out);
                }
-               longest = max(longest, bktcnt);
-               bktcnt = 0;
+               longest = max(longest, bucket_count);
+               bucket_count = 0;
        }
        spin_unlock(&dlm->master_lock);
 
index 5e02a89..e8d94d7 100644 (file)
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 {
        struct inode *inode;
        struct address_space *mapping;
+       struct ocfs2_inode_info *oi;
 
                inode = ocfs2_lock_res_inode(lockres);
        mapping = inode->i_mapping;
 
+       if (S_ISDIR(inode->i_mode)) {
+               oi = OCFS2_I(inode);
+               oi->ip_dir_lock_gen++;
+               mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+               goto out;
+       }
+
        if (!S_ISREG(inode->i_mode))
                goto out;
 
index 9a03c15..9e8cc43 100644 (file)
 
 #include "buffer_head_io.h"
 
-static int ocfs2_sync_inode(struct inode *inode)
-{
-       filemap_fdatawrite(inode->i_mapping);
-       return sync_mapping_buffers(inode->i_mapping);
-}
-
 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
 {
        struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 {
        int err = 0;
        journal_t *journal;
-       struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = file->f_mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-       mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
-                  dentry->d_name.len, dentry->d_name.name);
-
-       err = ocfs2_sync_inode(dentry->d_inode);
-       if (err)
-               goto bail;
+       mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+                  file->f_path.dentry, file->f_path.dentry->d_name.len,
+                  file->f_path.dentry->d_name.name);
 
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
                /*
@@ -370,7 +360,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
        if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
                goto out;
 
-       return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+       return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
 
 out:
        return status;
@@ -913,8 +903,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
                zero_clusters = last_cpos - zero_cpos;
 
        if (needs_cow) {
-               rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
-                                       UINT_MAX);
+               rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+                                       zero_clusters, UINT_MAX);
                if (rc) {
                        mlog_errno(rc);
                        goto out;
@@ -2062,6 +2052,7 @@ out:
 }
 
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+                                           struct file *file,
                                            loff_t pos, size_t count,
                                            int *meta_level)
 {
@@ -2079,7 +2070,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
 
        *meta_level = 1;
 
-       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+       ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
        if (ret)
                mlog_errno(ret);
 out:
@@ -2087,7 +2078,7 @@ out:
        return ret;
 }
 
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
                                         loff_t *ppos,
                                         size_t count,
                                         int appending,
@@ -2095,6 +2086,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                         int *has_refcount)
 {
        int ret = 0, meta_level = 0;
+       struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        loff_t saved_pos, end;
 
@@ -2150,6 +2142,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                        meta_level = -1;
 
                        ret = ocfs2_prepare_inode_for_refcount(inode,
+                                                              file,
                                                               saved_pos,
                                                               count,
                                                               &meta_level);
@@ -2232,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+                              OCFS2_MOUNT_COHERENCY_BUFFERED);
 
        mlog_entry("(0x%p, %u, '%.*s')\n", file,
                   (unsigned int)nr_segs,
@@ -2255,16 +2250,39 @@ relock:
                have_alloc_sem = 1;
        }
 
-       /* concurrent O_DIRECT writes are allowed */
-       rw_level = !direct_io;
+       /*
+        * Concurrent O_DIRECT writes are allowed with
+        * mount_option "coherency=buffered".
+        */
+       rw_level = (!direct_io || full_coherency);
+
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_sems;
        }
 
+       /*
+        * O_DIRECT writes with "coherency=full" need to take EX cluster
+        * inode_lock to guarantee coherency.
+        */
+       if (direct_io && full_coherency) {
+               /*
+                * We need to take and drop the inode lock to force
+                * other nodes to drop their caches.  Buffered I/O
+                * already does this in write_begin().
+                */
+               ret = ocfs2_inode_lock(inode, NULL, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_sems;
+               }
+
+               ocfs2_inode_unlock(inode, 1);
+       }
+
        can_do_direct = direct_io;
-       ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+       ret = ocfs2_prepare_inode_for_write(file, ppos,
                                            iocb->ki_left, appending,
                                            &can_do_direct, &has_refcount);
        if (ret < 0) {
@@ -2312,17 +2330,6 @@ relock:
                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                                                    ppos, count, ocount);
                if (written < 0) {
-                       /*
-                        * direct write may have instantiated a few
-                        * blocks outside i_size. Trim these off again.
-                        * Don't need i_size_read because we hold i_mutex.
-                        *
-                        * XXX(truncate): this looks buggy because ocfs2 did not
-                        * actually implement ->truncate.  Take a look at
-                        * the new truncate sequence and update this accordingly
-                        */
-                       if (*ppos + count > inode->i_size)
-                               truncate_setsize(inode, inode->i_size);
                        ret = written;
                        goto out_dio;
                }
@@ -2394,7 +2401,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
 {
        int ret;
 
-       ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+       ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
                                            sd->total_len, 0, NULL, NULL);
        if (ret < 0) {
                mlog_errno(ret);
index eece3e0..f935fd6 100644 (file)
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                    else
                            inode->i_fop = &ocfs2_dops_no_plocks;
                    i_size_write(inode, le64_to_cpu(fe->i_size));
+                   OCFS2_I(inode)->ip_dir_lock_gen = 1;
                    break;
            case S_IFLNK:
                    if (ocfs2_inode_is_fast_symlink(inode))
index 6de5a86..1c508b1 100644 (file)
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
        /* These fields are protected by ip_lock */
        spinlock_t                      ip_lock;
        u32                             ip_open_count;
-       u32                             ip_clusters;
        struct list_head                ip_io_markers;
+       u32                             ip_clusters;
 
+       u16                             ip_dyn_features;
        struct mutex                    ip_io_mutex;
-
        u32                             ip_flags; /* see below */
        u32                             ip_attr; /* inode attributes */
-       u16                             ip_dyn_features;
 
        /* protected by recovery_lock. */
        struct inode                    *ip_next_orphan;
 
-       u32                             ip_dir_start_lookup;
-
        struct ocfs2_caching_info       ip_metadata_cache;
-
        struct ocfs2_extent_map         ip_extent_map;
-
        struct inode                    vfs_inode;
        struct jbd2_inode               ip_jinode;
 
+       u32                             ip_dir_start_lookup;
+
        /* Only valid if the inode is the dir. */
        u32                             ip_last_used_slot;
        u64                             ip_last_used_group;
+       u32                             ip_dir_lock_gen;
 
        struct ocfs2_alloc_reservation  ip_la_data_resv;
 };
index 7d9d9c1..7a48681 100644 (file)
 
 #include <linux/ext2_fs.h>
 
+#define o2info_from_user(a, b) \
+               copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)   \
+               copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+                                       struct ocfs2_info_request __user *req)
+{
+       kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+       (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+#define o2info_set_request_error(a, b) \
+               __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
+
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
        int status;
@@ -109,6 +129,328 @@ bail:
        return status;
 }
 
+int ocfs2_info_handle_blocksize(struct inode *inode,
+                               struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_blocksize oib;
+
+       if (o2info_from_user(oib, req))
+               goto bail;
+
+       oib.ib_blocksize = inode->i_sb->s_blocksize;
+       oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oib, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oib, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_clustersize(struct inode *inode,
+                                 struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_clustersize oic;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (o2info_from_user(oic, req))
+               goto bail;
+
+       oic.ic_clustersize = osb->s_clustersize;
+       oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oic, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oic, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_maxslots(struct inode *inode,
+                              struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_maxslots oim;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (o2info_from_user(oim, req))
+               goto bail;
+
+       oim.im_max_slots = osb->max_slots;
+       oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oim, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oim, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_label(struct inode *inode,
+                           struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_label oil;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (o2info_from_user(oil, req))
+               goto bail;
+
+       memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+       oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oil, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oil, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_uuid(struct inode *inode,
+                          struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_uuid oiu;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (o2info_from_user(oiu, req))
+               goto bail;
+
+       memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+       oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oiu, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oiu, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_fs_features(struct inode *inode,
+                                 struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_fs_features oif;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (o2info_from_user(oif, req))
+               goto bail;
+
+       oif.if_compat_features = osb->s_feature_compat;
+       oif.if_incompat_features = osb->s_feature_incompat;
+       oif.if_ro_compat_features = osb->s_feature_ro_compat;
+       oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oif, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oif, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_journal_size(struct inode *inode,
+                                  struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_journal_size oij;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (o2info_from_user(oij, req))
+               goto bail;
+
+       oij.ij_journal_size = osb->journal->j_inode->i_size;
+
+       oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oij, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oij, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_unknown(struct inode *inode,
+                             struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_request oir;
+
+       if (o2info_from_user(oir, req))
+               goto bail;
+
+       oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+
+       if (o2info_to_user(oir, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(oir, req);
+
+       return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+                             struct ocfs2_info_request __user *req)
+{
+       int status = -EFAULT;
+       struct ocfs2_info_request oir;
+
+       if (o2info_from_user(oir, req))
+               goto bail;
+
+       status = -EINVAL;
+       if (oir.ir_magic != OCFS2_INFO_MAGIC)
+               goto bail;
+
+       switch (oir.ir_code) {
+       case OCFS2_INFO_BLOCKSIZE:
+               if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+                       status = ocfs2_info_handle_blocksize(inode, req);
+               break;
+       case OCFS2_INFO_CLUSTERSIZE:
+               if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+                       status = ocfs2_info_handle_clustersize(inode, req);
+               break;
+       case OCFS2_INFO_MAXSLOTS:
+               if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+                       status = ocfs2_info_handle_maxslots(inode, req);
+               break;
+       case OCFS2_INFO_LABEL:
+               if (oir.ir_size == sizeof(struct ocfs2_info_label))
+                       status = ocfs2_info_handle_label(inode, req);
+               break;
+       case OCFS2_INFO_UUID:
+               if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+                       status = ocfs2_info_handle_uuid(inode, req);
+               break;
+       case OCFS2_INFO_FS_FEATURES:
+               if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+                       status = ocfs2_info_handle_fs_features(inode, req);
+               break;
+       case OCFS2_INFO_JOURNAL_SIZE:
+               if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+                       status = ocfs2_info_handle_journal_size(inode, req);
+               break;
+       default:
+               status = ocfs2_info_handle_unknown(inode, req);
+               break;
+       }
+
+bail:
+       return status;
+}
+
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+                         u64 *req_addr, int compat_flag)
+{
+       int status = -EFAULT;
+       u64 __user *bp = NULL;
+
+       if (compat_flag) {
+#ifdef CONFIG_COMPAT
+               /*
+                * pointer bp stores the base address of a pointers array,
+                * which collects all addresses of separate request.
+                */
+               bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+               BUG();
+#endif
+       } else
+               bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+       if (o2info_from_user(*req_addr, bp + idx))
+               goto bail;
+
+       status = 0;
+bail:
+       return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+                     int compat_flag)
+{
+       int i, status = 0;
+       u64 req_addr;
+       struct ocfs2_info_request __user *reqp;
+
+       if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+           (!info->oi_requests)) {
+               status = -EINVAL;
+               goto bail;
+       }
+
+       for (i = 0; i < info->oi_count; i++) {
+
+               status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+               if (status)
+                       break;
+
+               reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+               if (!reqp) {
+                       status = -EINVAL;
+                       goto bail;
+               }
+
+               status = ocfs2_info_handle_request(inode, reqp);
+               if (status)
+                       break;
+       }
+
+bail:
+       return status;
+}
+
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        struct reflink_arguments args;
        const char *old_path, *new_path;
        bool preserve;
+       struct ocfs2_info info;
 
        switch (cmd) {
        case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                preserve = (args.preserve != 0);
 
                return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+       case OCFS2_IOC_INFO:
+               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+                                  sizeof(struct ocfs2_info)))
+                       return -EFAULT;
+
+               return ocfs2_info_handle(inode, &info, 0);
        default:
                return -ENOTTY;
        }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        bool preserve;
        struct reflink_arguments args;
        struct inode *inode = file->f_path.dentry->d_inode;
+       struct ocfs2_info info;
 
        switch (cmd) {
        case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 
                return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
                                           compat_ptr(args.new_path), preserve);
+       case OCFS2_IOC_INFO:
+               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+                                  sizeof(struct ocfs2_info)))
+                       return -EFAULT;
+
+               return ocfs2_info_handle(inode, &info, 1);
        default:
                return -ENOIOCTLCMD;
        }
index 9b57c03..faa2303 100644 (file)
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
        int status = 0;
        unsigned int flushed;
-       unsigned long old_id;
        struct ocfs2_journal *journal = NULL;
 
        mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
                goto finally;
        }
 
-       old_id = ocfs2_inc_trans_id(journal);
+       ocfs2_inc_trans_id(journal);
 
        flushed = atomic_read(&journal->j_num_trans);
        atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
        return status;
 }
 
-/* pass it NULL and it will allocate a new handle object for you.  If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
 handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 {
        journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
        os = &osb->osb_orphan_scan;
 
+       mlog(0, "Begin orphan scan\n");
+
        if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
                goto out;
 
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 unlock:
        ocfs2_orphan_scan_unlock(osb, seqno);
 out:
+       mlog(0, "Orphan scan completed\n");
        return;
 }
 
index b5baaa8..43e56b9 100644 (file)
@@ -67,11 +67,12 @@ struct ocfs2_journal {
        struct buffer_head        *j_bh;      /* Journal disk inode block */
        atomic_t                  j_num_trans; /* Number of transactions
                                                * currently in the system. */
+       spinlock_t                j_lock;
        unsigned long             j_trans_id;
        struct rw_semaphore       j_trans_barrier;
        wait_queue_head_t         j_checkpointed;
 
-       spinlock_t                j_lock;
+       /* both fields protected by j_lock*/
        struct list_head          j_la_cleanups;
        struct work_struct        j_recovery_work;
 };
index 4c18f4a..7e32db9 100644 (file)
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
        return ret;
 }
 
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
                                struct page *page)
 {
        int ret;
+       struct inode *inode = file->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        loff_t pos = page_offset(page);
        unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
        if (page->index == last_index)
                len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
 
-       ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
                                       &fsdata, di_bh, page);
        if (ret) {
                if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-       ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+       ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
 
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
index a00dda2..e7bde21 100644 (file)
@@ -171,7 +171,8 @@ bail_add:
                        ret = ERR_PTR(status);
                        goto bail_unlock;
                }
-       }
+       } else
+               ocfs2_dentry_attach_gen(dentry);
 
 bail_unlock:
        /* Don't drop the cluster lock until *after* the d_add --
index 481387b..d840821 100644 (file)
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
 struct ocfs2_lock_res {
        void                    *l_priv;
        struct ocfs2_lock_res_ops *l_ops;
-       spinlock_t               l_lock;
+
 
        struct list_head         l_blocked_list;
        struct list_head         l_mask_waiters;
 
-       enum ocfs2_lock_type     l_type;
        unsigned long            l_flags;
        char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
-       int                      l_level;
        unsigned int             l_ro_holders;
        unsigned int             l_ex_holders;
-       struct ocfs2_dlm_lksb    l_lksb;
+       unsigned char            l_level;
+
+       /* Data packed - type enum ocfs2_lock_type */
+       unsigned char            l_type;
 
        /* used from AST/BAST funcs. */
-       enum ocfs2_ast_action    l_action;
-       enum ocfs2_unlock_action l_unlock_action;
-       int                      l_requested;
-       int                      l_blocking;
+       /* Data packed - enum type ocfs2_ast_action */
+       unsigned char            l_action;
+       /* Data packed - enum type ocfs2_unlock_action */
+       unsigned char            l_unlock_action;
+       unsigned char            l_requested;
+       unsigned char            l_blocking;
        unsigned int             l_pending_gen;
 
+       spinlock_t               l_lock;
+
+       struct ocfs2_dlm_lksb    l_lksb;
+
        wait_queue_head_t        l_event;
 
        struct list_head         l_debug_list;
@@ -256,8 +263,10 @@ enum ocfs2_mount_options
                                                   control lists */
        OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
        OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
-       OCFS2_MOUNT_HB_NONE = 1 << 12, /* No heartbeat */
-       OCFS2_MOUNT_HB_GLOBAL = 1 << 13, /* Global heartbeat */
+       OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+                                                    writes */
+       OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+       OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
 };
 
 #define OCFS2_OSB_SOFT_RO                      0x0001
@@ -279,7 +288,8 @@ struct ocfs2_super
        struct super_block *sb;
        struct inode *root_inode;
        struct inode *sys_root_inode;
-       struct inode *system_inodes[NUM_SYSTEM_INODES];
+       struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+       struct inode **local_system_inodes;
 
        struct ocfs2_slot_info *slot_info;
 
index 28ff536..c2e4f82 100644 (file)
@@ -338,6 +338,7 @@ enum {
        USER_QUOTA_SYSTEM_INODE,
        GROUP_QUOTA_SYSTEM_INODE,
 #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
        ORPHAN_DIR_SYSTEM_INODE,
        EXTENT_ALLOC_SYSTEM_INODE,
        INODE_ALLOC_SYSTEM_INODE,
@@ -346,8 +347,12 @@ enum {
        TRUNCATE_LOG_SYSTEM_INODE,
        LOCAL_USER_QUOTA_SYSTEM_INODE,
        LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
        NUM_SYSTEM_INODES
 };
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES        \
+               (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
 
 static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
        /* Global system inodes (single copy) */
index 5d24150..b46f39b 100644 (file)
@@ -76,4 +76,99 @@ struct reflink_arguments {
 };
 #define OCFS2_IOC_REFLINK      _IOW('o', 4, struct reflink_arguments)
 
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST         (50)
+#define OCFS2_TEXT_UUID_LEN            (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC               (0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+       __u64 oi_requests;      /* Array of __u64 pointers to requests */
+       __u32 oi_count;         /* Number of requests in info_requests */
+       __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+       __u32 ir_code;  /* Info request code */
+       __u32 ir_size;  /* Size of request */
+       __u32 ir_flags; /* Request flags */
+/*10*/                 /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+       struct ocfs2_info_request ic_req;
+       __u32 ic_clustersize;
+       __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+       struct ocfs2_info_request ib_req;
+       __u32 ib_blocksize;
+       __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+       struct ocfs2_info_request im_req;
+       __u32 im_max_slots;
+       __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+       struct ocfs2_info_request il_req;
+       __u8    il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+       struct ocfs2_info_request iu_req;
+       __u8    iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+       struct ocfs2_info_request if_req;
+       __u32 if_compat_features;
+       __u32 if_incompat_features;
+       __u32 if_ro_compat_features;
+       __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+       struct ocfs2_info_request ij_req;
+       __u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+       OCFS2_INFO_CLUSTERSIZE = 1,
+       OCFS2_INFO_BLOCKSIZE,
+       OCFS2_INFO_MAXSLOTS,
+       OCFS2_INFO_LABEL,
+       OCFS2_INFO_UUID,
+       OCFS2_INFO_FS_FEATURES,
+       OCFS2_INFO_JOURNAL_SIZE,
+       OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT     (0x00000001)    /* Cluster coherency not
+                                                          required. This is a hint.
+                                                          It is up to ocfs2 whether
+                                                          the request can be fulfilled
+                                                          without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED           (0x40000000)    /* Filesystem understood
+                                                          this request and
+                                                          filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR            (0x80000000)    /* Error happened during
+                                                          request handling. */
+
+#define OCFS2_IOC_INFO         _IOR('o', 5, struct ocfs2_info)
+
 #endif /* OCFS2_IOCTL_H */
index efdd756..b5f9160 100644 (file)
@@ -49,6 +49,7 @@
 
 struct ocfs2_cow_context {
        struct inode *inode;
+       struct file *file;
        u32 cow_start;
        u32 cow_len;
        struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
        u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
        struct page *page;
        pgoff_t page_index;
-       unsigned int from, to;
+       unsigned int from, to, readahead_pages;
        loff_t offset, end, map_end;
        struct address_space *mapping = context->inode->i_mapping;
 
        mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
             new_cluster, new_len, cpos);
 
+       readahead_pages =
+               (ocfs2_cow_contig_clusters(sb) <<
+                OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
        offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
        end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
        /*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
                if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
                        BUG_ON(PageDirty(page));
 
+               if (PageReadahead(page) && context->file) {
+                       page_cache_async_readahead(mapping,
+                                                  &context->file->f_ra,
+                                                  context->file,
+                                                  page, page_index,
+                                                  readahead_pages);
+               }
+
                if (!PageUptodate(page)) {
                        ret = block_read_full_page(page, ocfs2_get_block);
                        if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
        return ret;
 }
 
+static void ocfs2_readahead_for_cow(struct inode *inode,
+                                   struct file *file,
+                                   u32 start, u32 len)
+{
+       struct address_space *mapping;
+       pgoff_t index;
+       unsigned long num_pages;
+       int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+
+       if (!file)
+               return;
+
+       mapping = file->f_mapping;
+       num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
+       if (!num_pages)
+               num_pages = 1;
+
+       index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+       page_cache_sync_readahead(mapping, &file->f_ra, file,
+                                 index, num_pages);
+}
+
 /*
  * Starting at cpos, try to CoW write_len clusters.  Don't CoW
  * past max_cpos.  This will stop when it runs into a hole or an
  * unrefcounted extent.
  */
 static int ocfs2_refcount_cow_hunk(struct inode *inode,
+                                  struct file *file,
                                   struct buffer_head *di_bh,
                                   u32 cpos, u32 write_len, u32 max_cpos)
 {
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
 
        BUG_ON(cow_len == 0);
 
+       ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+
        context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
        if (!context) {
                ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
        context->ref_root_bh = ref_root_bh;
        context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
        context->get_clusters = ocfs2_di_get_clusters;
+       context->file = file;
 
        ocfs2_init_dinode_extent_tree(&context->data_et,
                                      INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
  * clusters between cpos and cpos+write_len are safe to modify.
  */
 int ocfs2_refcount_cow(struct inode *inode,
+                      struct file *file,
                       struct buffer_head *di_bh,
                       u32 cpos, u32 write_len, u32 max_cpos)
 {
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
                        num_clusters = write_len;
 
                if (ext_flags & OCFS2_EXT_REFCOUNTED) {
-                       ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+                       ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
                                                      num_clusters, max_cpos);
                        if (ret) {
                                mlog_errno(ret);
index 9983ba1..c8ce46f 100644 (file)
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
        struct rb_node rf_node;
        u64 rf_blkno;
        u32 rf_generation;
+       struct kref rf_getcnt;
        struct rw_semaphore rf_sem;
        struct ocfs2_lock_res rf_lockres;
-       struct kref rf_getcnt;
        int rf_removed;
 
        /* the following 4 fields are used by caching_info. */
-       struct ocfs2_caching_info rf_ci;
        spinlock_t rf_lock;
+       struct ocfs2_caching_info rf_ci;
        struct mutex rf_io_mutex;
        struct super_block *rf_sb;
 };
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
                                          u32 clusters,
                                          int *credits,
                                          int *ref_blocks);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_refcount_cow(struct inode *inode,
+                      struct file *filep, struct buffer_head *di_bh,
                       u32 cpos, u32 write_len, u32 max_cpos);
 
 typedef int (ocfs2_post_refcount_func)(struct inode *inode,
index bfbd7e9..ab4e017 100644 (file)
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 {
        int status = 0;
        u64 blkno;
-       unsigned long long blocks, bytes;
+       unsigned long long blocks, bytes = 0;
        unsigned int i;
        struct buffer_head *bh;
 
index 849c2f0..5fed60d 100644 (file)
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
        }
 
        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+       if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+                           " count %u but claims %u are freed. num_bits %d",
+                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                           le16_to_cpu(bg->bg_bits),
+                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
+               return -EROFS;
+       }
        while(num_bits--)
                ocfs2_set_bit(bit_off++, bitmap);
 
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
                                (unsigned long *) undo_bg->bg_bitmap);
        }
        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+       if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+                           " count %u but claims %u are freed. num_bits %d",
+                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                           le16_to_cpu(bg->bg_bits),
+                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
+               return -EROFS;
+       }
 
        if (undo_fn)
                jbd_unlock_bh_state(group_bh);
index 4e009ad..a8a0ca4 100644 (file)
@@ -178,6 +178,8 @@ enum {
        Opt_noacl,
        Opt_usrquota,
        Opt_grpquota,
+       Opt_coherency_buffered,
+       Opt_coherency_full,
        Opt_resv_level,
        Opt_dir_resv_level,
        Opt_err,
@@ -207,6 +209,8 @@ static const match_table_t tokens = {
        {Opt_noacl, "noacl"},
        {Opt_usrquota, "usrquota"},
        {Opt_grpquota, "grpquota"},
+       {Opt_coherency_buffered, "coherency=buffered"},
+       {Opt_coherency_full, "coherency=full"},
        {Opt_resv_level, "resv_level=%u"},
        {Opt_dir_resv_level, "dir_resv_level=%u"},
        {Opt_err, NULL}
@@ -516,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 
        mlog_entry_void();
 
-       for (i = 0; i < NUM_SYSTEM_INODES; i++) {
-               inode = osb->system_inodes[i];
+       for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+               inode = osb->global_system_inodes[i];
                if (inode) {
                        iput(inode);
-                       osb->system_inodes[i] = NULL;
+                       osb->global_system_inodes[i] = NULL;
                }
        }
 
@@ -536,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
                osb->root_inode = NULL;
        }
 
+       if (!osb->local_system_inodes)
+               goto out;
+
+       for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+               if (osb->local_system_inodes[i]) {
+                       iput(osb->local_system_inodes[i]);
+                       osb->local_system_inodes[i] = NULL;
+               }
+       }
+
+       kfree(osb->local_system_inodes);
+       osb->local_system_inodes = NULL;
+
+out:
        mlog_exit(0);
 }
 
@@ -1452,6 +1470,12 @@ static int ocfs2_parse_options(struct super_block *sb,
                case Opt_grpquota:
                        mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
                        break;
+               case Opt_coherency_buffered:
+                       mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+                       break;
+               case Opt_coherency_full:
+                       mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+                       break;
                case Opt_acl:
                        mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
                        mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1563,6 +1587,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_GRPQUOTA)
                seq_printf(s, ",grpquota");
 
+       if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+               seq_printf(s, ",coherency=buffered");
+       else
+               seq_printf(s, ",coherency=full");
+
        if (opts & OCFS2_MOUNT_NOUSERXATTR)
                seq_printf(s, ",nouser_xattr");
        else
@@ -2017,6 +2046,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
        return 0;
 }
 
+/* Make sure entire volume is addressable by our journal.  Requires
+   osb_clusters_at_boot to be valid and for the journal to have been
+   initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+       int status = 0;
+       u64 max_block =
+               ocfs2_clusters_to_blocks(osb->sb,
+                                        osb->osb_clusters_at_boot) - 1;
+
+       /* 32-bit block number is always OK. */
+       if (max_block <= (u32)~0ULL)
+               goto out;
+
+       /* Volume is "huge", so see if our journal is new enough to
+          support it. */
+       if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+                                      OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+             jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+                                              JBD2_FEATURE_INCOMPAT_64BIT))) {
+               mlog(ML_ERROR, "The journal cannot address the entire volume. "
+                    "Enable the 'block64' journal option with tunefs.ocfs2");
+               status = -EFBIG;
+               goto out;
+       }
+
+ out:
+       return status;
+}
+
 static int ocfs2_initialize_super(struct super_block *sb,
                                  struct buffer_head *bh,
                                  int sector_size,
@@ -2029,6 +2088,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        struct ocfs2_journal *journal;
        __le32 uuid_net_key;
        struct ocfs2_super *osb;
+       u64 total_blocks;
 
        mlog_entry_void();
 
@@ -2087,6 +2147,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
        snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
                 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+       osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+       if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+               mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+                    osb->max_slots);
+               status = -EINVAL;
+               goto bail;
+       }
+       mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+
        ocfs2_orphan_scan_init(osb);
 
        status = ocfs2_recovery_init(osb);
@@ -2125,15 +2194,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
 
-       osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
-       if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
-               mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
-                    osb->max_slots);
-               status = -EINVAL;
-               goto bail;
-       }
-       mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
        osb->slot_recovery_generations =
                kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
                        GFP_KERNEL);
@@ -2243,11 +2303,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
 
-       if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
-           > (u32)~0UL) {
-               mlog(ML_ERROR, "Volume might try to write to blocks beyond "
-                    "what jbd can address in 32 bits.\n");
-               status = -EINVAL;
+       total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+                                               le32_to_cpu(di->i_clusters));
+
+       status = generic_check_addressable(osb->sb->s_blocksize_bits,
+                                          total_blocks);
+       if (status) {
+               mlog(ML_ERROR, "Volume too large "
+                    "to mount safely on this system");
+               status = -EFBIG;
                goto bail;
        }
 
@@ -2409,6 +2473,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
                goto finally;
        }
 
+       /* Now that journal has been initialized, check to make sure
+          entire volume is addressable. */
+       status = ocfs2_journal_addressable(osb);
+       if (status)
+               goto finally;
+
        /* If the journal was unmounted cleanly then we don't want to
         * recover anything. Otherwise, journal_load will do that
         * dirty work for us :) */
index bfe7190..902efb2 100644 (file)
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
                                                   int type,
                                                   u32 slot);
 
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-                                          int type,
-                                          u32 slot);
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
 #endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
                type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
 }
 
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-                                          int type,
-                                          u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+                                            int type,
+                                            u32 slot)
 {
-       return slot == osb->slot_num || is_global_system_inode(type);
+       int index;
+       struct inode **local_system_inodes, **free = NULL;
+
+       BUG_ON(slot == OCFS2_INVALID_SLOT);
+       BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+              type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+       spin_lock(&osb->osb_lock);
+       local_system_inodes = osb->local_system_inodes;
+       spin_unlock(&osb->osb_lock);
+
+       if (unlikely(!local_system_inodes)) {
+               local_system_inodes = kzalloc(sizeof(struct inode *) *
+                                             NUM_LOCAL_SYSTEM_INODES *
+                                             osb->max_slots,
+                                             GFP_NOFS);
+               if (!local_system_inodes) {
+                       mlog_errno(-ENOMEM);
+                       /*
+                        * return NULL here so that ocfs2_get_sytem_file_inodes
+                        * will try to create an inode and use it. We will try
+                        * to initialize local_system_inodes next time.
+                        */
+                       return NULL;
+               }
+
+               spin_lock(&osb->osb_lock);
+               if (osb->local_system_inodes) {
+                       /* Someone has initialized it for us. */
+                       free = local_system_inodes;
+                       local_system_inodes = osb->local_system_inodes;
+               } else
+                       osb->local_system_inodes = local_system_inodes;
+               spin_unlock(&osb->osb_lock);
+               if (unlikely(free))
+                       kfree(free);
+       }
+
+       index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+               (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+       return &local_system_inodes[index];
 }
 
 struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
        struct inode **arr = NULL;
 
        /* avoid the lookup if cached in local system file array */
-       if (is_in_system_inode_array(osb, type, slot))
-               arr = &(osb->system_inodes[type]);
+       if (is_global_system_inode(type)) {
+               arr = &(osb->global_system_inodes[type]);
+       } else
+               arr = get_local_system_inode(osb, type, slot);
 
        if (arr && ((inode = *arr) != NULL)) {
                /* get a ref in addition to the array ref */
index 63d069b..ae527be 100644 (file)
@@ -2378,6 +2378,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 
 extern int generic_file_fsync(struct file *, int);
 
+extern int generic_check_addressable(unsigned, u64);
+
 #ifdef CONFIG_MIGRATION
 extern int buffer_migrate_page(struct address_space *,
                                struct page *, struct page *);