Merge branch 'globalheartbeat-2' of git://oss.oracle.com/git/smushran/linux-2.6 into...

author Joel Becker <joel.becker@oracle.com>

Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)

committer Joel Becker <joel.becker@oracle.com>

Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
author Joel Becker <joel.becker@oracle.com>
Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
committer Joel Becker <joel.becker@oracle.com>
Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt

index 1f7ae14..5393e66 100644 (file)
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -87,3 +87,10 @@ dir_resv_level=      (*)     By default, directory reservations will scale with file
                         reservations - users should rarely need to change this
                         value. If allocation reservations are turned off, this
                         option will have no effect.
+coherency=full  (*)    Disallow concurrent O_DIRECT writes.  The cluster inode
+                       lock is taken to force other nodes to drop their
+                       caches, so full cluster coherency is guaranteed even
+                       for O_DIRECT writes.
+coherency=buffered     Allow concurrent O_DIRECT writes without an EX lock
+                       among nodes, which improves performance at the risk
+                       of reading stale data on other nodes.
diff --git a/fs/ext3/super.c b/fs/ext3/super.c

index 5dbf4db..a367dd0 100644 (file)
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1849,8 +1849,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                 goto failed_mount;
         }
  
-       if (le32_to_cpu(es->s_blocks_count) >
-                   (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+       if (generic_check_addressable(sb->s_blocksize_bits,
+                                     le32_to_cpu(es->s_blocks_count))) {
                 ext3_msg(sb, KERN_ERR,
                         "error: filesystem is too large to mount safely");
                 if (sizeof(sector_t) < 8)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 2614774..7f47c36 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2831,15 +2831,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
          * Test whether we have more sectors than will fit in sector_t,
          * and whether the max offset is addressable by the page cache.
          */
-       if ((ext4_blocks_count(es) >
-            (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
-           (ext4_blocks_count(es) >
-            (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+       ret = generic_check_addressable(sb->s_blocksize_bits,
+                                       ext4_blocks_count(es));
+       if (ret) {
                 ext4_msg(sb, KERN_ERR, "filesystem"
                          " too large to mount safely on this system");
                 if (sizeof(sector_t) < 8)
                         ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
-               ret = -EFBIG;
                 goto failed_mount;
         }
  
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c

index 0e8014e..262419f 100644 (file)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
  
         if (!compat && !ro && !incompat)
                 return 1;
+       /* Load journal superblock if it is not loaded yet. */
+       if (journal->j_format_version == 0 &&
+           journal_get_superblock(journal) != 0)
+               return 0;
         if (journal->j_format_version == 1)
                 return 0;
  
diff --git a/fs/libfs.c b/fs/libfs.c

index 0a9da95..62baa03 100644 (file)
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync)
  }
  EXPORT_SYMBOL(generic_file_fsync);
  
+/**
+ * generic_check_addressable - Check addressability of file system
+ * @blocksize_bits:    log of file system block size
+ * @num_blocks:                number of blocks in file system
+ *
+ * Determine whether a file system with @num_blocks blocks (and a
+ * block size of 2**@blocksize_bits) is addressable by the sector_t
+ * and page cache of the system.  Return 0 if so, -EINVAL if
+ * @blocksize_bits is outside 9..PAGE_CACHE_SHIFT, else -EFBIG.
+ */
+int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
+{
+       u64 last_fs_block = num_blocks - 1;
+       u64 last_fs_page;
+
+       if (unlikely(num_blocks == 0))
+               return 0;
+       if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+               return -EINVAL;
+
+       /* Shift only after the range check: a bad shift count is undefined. */
+       last_fs_page = last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+       if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+           (last_fs_page > (pgoff_t)(~0ULL)))
+               return -EFBIG;
+       return 0;
+}
+EXPORT_SYMBOL(generic_check_addressable);
+
  /*
   * No-op implementation of ->fsync for in-memory filesystems.
   */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c

index 0de69c9..5cfeee1 100644 (file)
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
          * out in so that future reads from that region will get
          * zero's.
          */
-       struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
         unsigned int                    w_num_pages;
+       struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
         struct page                     *w_target_page;
  
         /*
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
         return ret;
  }
  
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+                            struct address_space *mapping,
                              loff_t pos, unsigned len, unsigned flags,
                              struct page **pagep, void **fsdata,
                              struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                 mlog_errno(ret);
                 goto out;
         } else if (ret == 1) {
-               ret = ocfs2_refcount_cow(inode, di_bh,
+               ret = ocfs2_refcount_cow(inode, filp, di_bh,
                                          wc->w_cpos, wc->w_clen, UINT_MAX);
                 if (ret) {
                         mlog_errno(ret);
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
          */
         down_write(&OCFS2_I(inode)->ip_alloc_sem);
  
-       ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
                                        fsdata, di_bh, NULL);
         if (ret) {
                 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h

index c48e93f..7606f66 100644 (file)
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned copied,
                            struct page *page, void *fsdata);
  
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+                            struct address_space *mapping,
                              loff_t pos, unsigned len, unsigned flags,
                              struct page **pagep, void **fsdata,
                              struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c

index b4957c7..edaded4 100644 (file)
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
  #include "inode.h"
  #include "super.h"
  
+/* Record the parent directory's lock generation in a negative dentry. */
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+       struct ocfs2_inode_info *dir = OCFS2_I(dentry->d_parent->d_inode);
+       unsigned long dir_gen = dir->ip_dir_lock_gen;
+       BUG_ON(dentry->d_inode);
+       dentry->d_fsdata = (void *)dir_gen;
+}
  
  static int ocfs2_dentry_revalidate(struct dentry *dentry,
                                    struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
         mlog_entry("(0x%p, '%.*s')\n", dentry,
                    dentry->d_name.len, dentry->d_name.name);
  
-       /* Never trust a negative dentry - force a new lookup. */
+       /* For a negative dentry -
+        * compare the parent directory's current lock generation with
+        * the one saved in the dentry's d_fsdata.
+        */
         if (inode == NULL) {
-               mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
-                    dentry->d_name.name);
-               goto bail;
+               unsigned long gen = (unsigned long) dentry->d_fsdata;
+               unsigned long pgen =
+                       OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+               mlog(0, "negative dentry: %.*s parent gen: %lu "
+                       "dentry gen: %lu\n",
+                       dentry->d_name.len, dentry->d_name.name, pgen, gen);
+               if (gen != pgen)
+                       goto bail;
+               goto valid;
         }
  
         BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
                 goto bail;
         }
  
+valid:
         ret = 1;
  
  bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
         if (!inode)
                 return 0;
  
+       if (!dentry->d_inode && dentry->d_fsdata) {
+               /* Converting a negative dentry to positive
+                  Clear dentry->d_fsdata */
+               dentry->d_fsdata = dl = NULL;
+       }
+
         if (dl) {
                 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
                                 " \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
  
  out:
         iput(inode);
+       ocfs2_dentry_attach_gen(dentry);
  }
  
  /*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h

index f5dd178..b79eff7 100644 (file)
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
                        struct inode *old_dir, struct inode *new_dir);
  
  extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
  
  #endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c

index f693ab8..272ec86 100644 (file)
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
         struct hlist_head *bucket;
         struct hlist_node *list;
         int i, out = 0;
-       unsigned long total = 0, longest = 0, bktcnt;
+       unsigned long total = 0, longest = 0, bucket_count = 0;
  
         out += snprintf(db->buf + out, db->len - out,
                         "Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                         mle = hlist_entry(list, struct dlm_master_list_entry,
                                           master_hash_node);
                         ++total;
-                       ++bktcnt;
+                       ++bucket_count;
                         if (db->len - out < 200)
                                 continue;
                         out += dump_mle(mle, db->buf + out, db->len - out);
                 }
-               longest = max(longest, bktcnt);
-               bktcnt = 0;
+               longest = max(longest, bucket_count);
+               bucket_count = 0;
         }
         spin_unlock(&dlm->master_lock);
  
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c

index 5e02a89..e8d94d7 100644 (file)
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
  {
         struct inode *inode;
         struct address_space *mapping;
+       struct ocfs2_inode_info *oi;
  
                 inode = ocfs2_lock_res_inode(lockres);
         mapping = inode->i_mapping;
  
+       if (S_ISDIR(inode->i_mode)) {
+               oi = OCFS2_I(inode);
+               oi->ip_dir_lock_gen++;
+               mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+               goto out;
+       }
+
         if (!S_ISREG(inode->i_mode))
                 goto out;
  
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c

index 9a03c15..9e8cc43 100644 (file)
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
  
  #include "buffer_head_io.h"
  
-static int ocfs2_sync_inode(struct inode *inode)
-{
-       filemap_fdatawrite(inode->i_mapping);
-       return sync_mapping_buffers(inode->i_mapping);
-}
-
  static int ocfs2_init_file_private(struct inode *inode, struct file *file)
  {
         struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
  {
         int err = 0;
         journal_t *journal;
-       struct dentry *dentry = file->f_path.dentry;
         struct inode *inode = file->f_mapping->host;
         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  
-       mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
-                  dentry->d_name.len, dentry->d_name.name);
-
-       err = ocfs2_sync_inode(dentry->d_inode);
-       if (err)
-               goto bail;
+       mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+                  file->f_path.dentry, file->f_path.dentry->d_name.len,
+                  file->f_path.dentry->d_name.name);
  
         if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
                 /*
@@ -370,7 +360,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
         if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
                 goto out;
  
-       return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+       return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
  
  out:
         return status;
@@ -913,8 +903,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
                 zero_clusters = last_cpos - zero_cpos;
  
         if (needs_cow) {
-               rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
-                                       UINT_MAX);
+               rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+                                       zero_clusters, UINT_MAX);
                 if (rc) {
                         mlog_errno(rc);
                         goto out;
@@ -2062,6 +2052,7 @@ out:
  }
  
  static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+                                           struct file *file,
                                             loff_t pos, size_t count,
                                             int *meta_level)
  {
@@ -2079,7 +2070,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
  
         *meta_level = 1;
  
-       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+       ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
         if (ret)
                 mlog_errno(ret);
  out:
@@ -2087,7 +2078,7 @@ out:
         return ret;
  }
  
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
                                          loff_t *ppos,
                                          size_t count,
                                          int appending,
@@ -2095,6 +2086,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                          int *has_refcount)
  {
         int ret = 0, meta_level = 0;
+       struct dentry *dentry = file->f_path.dentry;
         struct inode *inode = dentry->d_inode;
         loff_t saved_pos, end;
  
@@ -2150,6 +2142,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                         meta_level = -1;
  
                         ret = ocfs2_prepare_inode_for_refcount(inode,
+                                                              file,
                                                                saved_pos,
                                                                count,
                                                                &meta_level);
@@ -2232,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_path.dentry->d_inode;
         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+                              OCFS2_MOUNT_COHERENCY_BUFFERED);
  
         mlog_entry("(0x%p, %u, '%.*s')\n", file,
                    (unsigned int)nr_segs,
@@ -2255,16 +2250,39 @@ relock:
                 have_alloc_sem = 1;
         }
  
-       /* concurrent O_DIRECT writes are allowed */
-       rw_level = !direct_io;
+       /*
+        * Concurrent O_DIRECT writes are allowed with
+        * mount_option "coherency=buffered".
+        */
+       rw_level = (!direct_io || full_coherency);
+
         ret = ocfs2_rw_lock(inode, rw_level);
         if (ret < 0) {
                 mlog_errno(ret);
                 goto out_sems;
         }
  
+       /*
+        * O_DIRECT writes with "coherency=full" need to take EX cluster
+        * inode_lock to guarantee coherency.
+        */
+       if (direct_io && full_coherency) {
+               /*
+                * We need to take and drop the inode lock to force
+                * other nodes to drop their caches.  Buffered I/O
+                * already does this in write_begin().
+                */
+               ret = ocfs2_inode_lock(inode, NULL, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_sems;
+               }
+
+               ocfs2_inode_unlock(inode, 1);
+       }
+
         can_do_direct = direct_io;
-       ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+       ret = ocfs2_prepare_inode_for_write(file, ppos,
                                             iocb->ki_left, appending,
                                             &can_do_direct, &has_refcount);
         if (ret < 0) {
@@ -2312,17 +2330,6 @@ relock:
                 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                                                     ppos, count, ocount);
                 if (written < 0) {
-                       /*
-                        * direct write may have instantiated a few
-                        * blocks outside i_size. Trim these off again.
-                        * Don't need i_size_read because we hold i_mutex.
-                        *
-                        * XXX(truncate): this looks buggy because ocfs2 did not
-                        * actually implement ->truncate.  Take a look at
-                        * the new truncate sequence and update this accordingly
-                        */
-                       if (*ppos + count > inode->i_size)
-                               truncate_setsize(inode, inode->i_size);
                         ret = written;
                         goto out_dio;
                 }
@@ -2394,7 +2401,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
  {
         int ret;
  
-       ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+       ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
                                             sd->total_len, 0, NULL, NULL);
         if (ret < 0) {
                 mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c

index eece3e0..f935fd6 100644 (file)
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                     else
                             inode->i_fop = &ocfs2_dops_no_plocks;
                     i_size_write(inode, le64_to_cpu(fe->i_size));
+                   OCFS2_I(inode)->ip_dir_lock_gen = 1;
                     break;
             case S_IFLNK:
                     if (ocfs2_inode_is_fast_symlink(inode))
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h

index 6de5a86..1c508b1 100644 (file)
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
         /* These fields are protected by ip_lock */
         spinlock_t                      ip_lock;
         u32                             ip_open_count;
-       u32                             ip_clusters;
         struct list_head                ip_io_markers;
+       u32                             ip_clusters;
  
+       u16                             ip_dyn_features;
         struct mutex                    ip_io_mutex;
-
         u32                             ip_flags; /* see below */
         u32                             ip_attr; /* inode attributes */
-       u16                             ip_dyn_features;
  
         /* protected by recovery_lock. */
         struct inode                    *ip_next_orphan;
  
-       u32                             ip_dir_start_lookup;
-
         struct ocfs2_caching_info       ip_metadata_cache;
-
         struct ocfs2_extent_map         ip_extent_map;
-
         struct inode                    vfs_inode;
         struct jbd2_inode               ip_jinode;
  
+       u32                             ip_dir_start_lookup;
+
         /* Only valid if the inode is the dir. */
         u32                             ip_last_used_slot;
         u64                             ip_last_used_group;
+       u32                             ip_dir_lock_gen;
  
         struct ocfs2_alloc_reservation  ip_la_data_resv;
  };
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c

index 7d9d9c1..7a48681 100644 (file)
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
  
  #include <linux/ext2_fs.h>
  
+/* Copy request struct @a from/to user pointer @b; nonzero on failure. */
+#define o2info_from_user(a, b) copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)   \
+               copy_to_user((typeof(a) __user *)(b), &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+                                       struct ocfs2_info_request __user *req)
+{
+       kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+       (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+#define o2info_set_request_error(a, b) \
+               __o2info_set_request_error((struct ocfs2_info_request *)&(a), (b))
+
  static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
  {
         int status;
@@ -109,6 +129,328 @@ bail:
         return status;
  }
  
+int ocfs2_info_handle_blocksize(struct inode *inode,
+                               struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_info_blocksize oib;
+       int status = 0;
+
+       /* Fetch the request, fill in the block size and copy it back. */
+       if (o2info_from_user(oib, req)) {
+               status = -EFAULT;
+       } else {
+               oib.ib_blocksize = inode->i_sb->s_blocksize;
+               oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oib, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oib, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_clustersize(struct inode *inode,
+                                 struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_info_clustersize oic;
+       int status = 0;
+
+       /* Fetch the request, fill in the cluster size and copy it back. */
+       if (o2info_from_user(oic, req)) {
+               status = -EFAULT;
+       } else {
+               oic.ic_clustersize = osb->s_clustersize;
+               oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oic, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oic, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_maxslots(struct inode *inode,
+                              struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_info_maxslots oim;
+       int status = 0;
+
+       /* Fetch the request, fill in the slot limit and copy it back. */
+       if (o2info_from_user(oim, req)) {
+               status = -EFAULT;
+       } else {
+               oim.im_max_slots = osb->max_slots;
+               oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oim, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oim, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_label(struct inode *inode,
+                           struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_info_label oil;
+       int status = 0;
+
+       /* Fetch the request, fill in the volume label and copy it back. */
+       if (o2info_from_user(oil, req)) {
+               status = -EFAULT;
+       } else {
+               memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+               oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oil, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oil, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_uuid(struct inode *inode,
+                          struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_info_uuid oiu;
+       int status = 0;
+
+       /* Fetch the request, fill in the UUID string and copy it back. */
+       if (o2info_from_user(oiu, req)) {
+               status = -EFAULT;
+       } else {
+               memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+               oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oiu, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oiu, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_fs_features(struct inode *inode,
+                                 struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_info_fs_features oif;
+       int status = 0;
+
+       /* Fetch the request, fill in the feature masks and copy it back. */
+       if (o2info_from_user(oif, req)) {
+               status = -EFAULT;
+       } else {
+               oif.if_compat_features = osb->s_feature_compat;
+               oif.if_incompat_features = osb->s_feature_incompat;
+               oif.if_ro_compat_features = osb->s_feature_ro_compat;
+               oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oif, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oif, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_journal_size(struct inode *inode,
+                                  struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_info_journal_size oij;
+       int status = 0;
+
+       /* Fetch the request, fill in the journal size and copy it back. */
+       if (o2info_from_user(oij, req)) {
+               status = -EFAULT;
+       } else {
+               oij.ij_journal_size = osb->journal->j_inode->i_size;
+
+               oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oij, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oij, req);
+
+       return status;
+}
+
+int ocfs2_info_handle_unknown(struct inode *inode,
+                             struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_info_request oir;
+       int status = 0;
+
+       /* Unrecognized code: hand the request back with FILLED cleared. */
+       if (o2info_from_user(oir, req)) {
+               status = -EFAULT;
+       } else {
+               oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+               if (o2info_to_user(oir, req))
+                       status = -EFAULT;
+       }
+
+       /* Best effort: tell userspace this request caused the error. */
+       if (status)
+               o2info_set_request_error(oir, req);
+
+       return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+                             struct ocfs2_info_request __user *req)
+{
+       struct ocfs2_info_request oir;
+
+       if (o2info_from_user(oir, req))
+               return -EFAULT;
+
+       if (oir.ir_magic != OCFS2_INFO_MAGIC)
+               return -EINVAL;
+
+       /*
+        * Dispatch on the request code.  A known code whose size does not
+        * match its structure leaves the switch and is rejected below.
+        */
+       switch (oir.ir_code) {
+       case OCFS2_INFO_BLOCKSIZE:
+               if (oir.ir_size != sizeof(struct ocfs2_info_blocksize))
+                       break;
+               return ocfs2_info_handle_blocksize(inode, req);
+       case OCFS2_INFO_CLUSTERSIZE:
+               if (oir.ir_size != sizeof(struct ocfs2_info_clustersize))
+                       break;
+               return ocfs2_info_handle_clustersize(inode, req);
+       case OCFS2_INFO_MAXSLOTS:
+               if (oir.ir_size != sizeof(struct ocfs2_info_maxslots))
+                       break;
+               return ocfs2_info_handle_maxslots(inode, req);
+       case OCFS2_INFO_LABEL:
+               if (oir.ir_size != sizeof(struct ocfs2_info_label))
+                       break;
+               return ocfs2_info_handle_label(inode, req);
+       case OCFS2_INFO_UUID:
+               if (oir.ir_size != sizeof(struct ocfs2_info_uuid))
+                       break;
+               return ocfs2_info_handle_uuid(inode, req);
+       case OCFS2_INFO_FS_FEATURES:
+               if (oir.ir_size != sizeof(struct ocfs2_info_fs_features))
+                       break;
+               return ocfs2_info_handle_fs_features(inode, req);
+       case OCFS2_INFO_JOURNAL_SIZE:
+               if (oir.ir_size != sizeof(struct ocfs2_info_journal_size))
+                       break;
+               return ocfs2_info_handle_journal_size(inode, req);
+       default:
+               return ocfs2_info_handle_unknown(inode, req);
+       }
+
+       return -EINVAL;
+}
+
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+                         u64 *req_addr, int compat_flag)
+{
+       /*
+        * requests points at a userspace array holding the address of
+        * each separate request; slot idx is the one fetched here.
+        */
+       u64 __user *requests = NULL;
+
+       if (!compat_flag) {
+               requests = (u64 __user *)(unsigned long)(info->oi_requests);
+       } else {
+#ifdef CONFIG_COMPAT
+               requests = (u64 __user *)(unsigned long)
+                               compat_ptr(info->oi_requests);
+#else
+               BUG();
+#endif
+       }
+
+       /* Read the idx-th request address out of the userspace array. */
+       if (o2info_from_user(*req_addr, requests + idx))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() receives a large info aggregation, grabs and
+ * validates the request count from the header, then breaks it into
+ * small pieces so that specific handlers can handle them one by one.
+ *
+ * The idea is to keep each separate request small enough to ensure
+ * better backward and forward compatibility, since a small request is
+ * less likely to be broken if the disk layout gets changed.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+                     int compat_flag)
+{
+       int i, status = 0;
+       u64 req_addr;
+       struct ocfs2_info_request __user *reqp;
+
+       if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+           (!info->oi_requests)) {
+               status = -EINVAL;
+               goto bail;
+       }
+
+       for (i = 0; i < info->oi_count; i++) {
+               status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+               if (status)
+                       break;
+
+               /* req_addr came from userspace: keep the pointer __user. */
+               reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
+               if (!reqp) {
+                       status = -EINVAL;
+                       goto bail;
+               }
+
+               status = ocfs2_info_handle_request(inode, reqp);
+               if (status)
+                       break;
+       }
+
+bail:
+       return status;
+}
+
  long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
  {
         struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
         struct reflink_arguments args;
         const char *old_path, *new_path;
         bool preserve;
+       struct ocfs2_info info;
  
         switch (cmd) {
         case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                 preserve = (args.preserve != 0);
  
                 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+       case OCFS2_IOC_INFO:
+               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+                                  sizeof(struct ocfs2_info)))
+                       return -EFAULT;
+
+               return ocfs2_info_handle(inode, &info, 0);
         default:
                 return -ENOTTY;
         }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
         bool preserve;
         struct reflink_arguments args;
         struct inode *inode = file->f_path.dentry->d_inode;
+       struct ocfs2_info info;
  
         switch (cmd) {
         case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
  
                 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
                                            compat_ptr(args.new_path), preserve);
+       case OCFS2_IOC_INFO:
+               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+                                  sizeof(struct ocfs2_info)))
+                       return -EFAULT;
+
+               return ocfs2_info_handle(inode, &info, 1);
         default:
                 return -ENOIOCTLCMD;
         }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c

index 9b57c03..faa2303 100644 (file)
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
  {
         int status = 0;
         unsigned int flushed;
-       unsigned long old_id;
         struct ocfs2_journal *journal = NULL;
  
         mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
                 goto finally;
         }
  
-       old_id = ocfs2_inc_trans_id(journal);
+       ocfs2_inc_trans_id(journal);
  
         flushed = atomic_read(&journal->j_num_trans);
         atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
         return status;
  }
  
-/* pass it NULL and it will allocate a new handle object for you.  If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
  handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
  {
         journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
  
         os = &osb->osb_orphan_scan;
  
+       mlog(0, "Begin orphan scan\n");
+
         if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
                 goto out;
  
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
  unlock:
         ocfs2_orphan_scan_unlock(osb, seqno);
  out:
+       mlog(0, "Orphan scan completed\n");
         return;
  }
  
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h

index b5baaa8..43e56b9 100644 (file)
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
         struct buffer_head        *j_bh;      /* Journal disk inode block */
         atomic_t                  j_num_trans; /* Number of transactions
                                                 * currently in the system. */
+       spinlock_t                j_lock;
         unsigned long             j_trans_id;
         struct rw_semaphore       j_trans_barrier;
         wait_queue_head_t         j_checkpointed;
  
-       spinlock_t                j_lock;
+       /* both fields protected by j_lock*/
         struct list_head          j_la_cleanups;
         struct work_struct        j_recovery_work;
  };
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c

index 4c18f4a..7e32db9 100644 (file)
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
         return ret;
  }
  
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
                                 struct page *page)
  {
         int ret;
+       struct inode *inode = file->f_path.dentry->d_inode;
         struct address_space *mapping = inode->i_mapping;
         loff_t pos = page_offset(page);
         unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
         if (page->index == last_index)
                 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
  
-       ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
                                        &fsdata, di_bh, page);
         if (ret) {
                 if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
          */
         down_write(&OCFS2_I(inode)->ip_alloc_sem);
  
-       ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+       ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
  
         up_write(&OCFS2_I(inode)->ip_alloc_sem);
  
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c

index a00dda2..e7bde21 100644 (file)
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
                         ret = ERR_PTR(status);
                         goto bail_unlock;
                 }
-       }
+       } else
+               ocfs2_dentry_attach_gen(dentry);
  
  bail_unlock:
         /* Don't drop the cluster lock until *after* the d_add --
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h

index 481387b..d840821 100644 (file)
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
  struct ocfs2_lock_res {
         void                    *l_priv;
         struct ocfs2_lock_res_ops *l_ops;
-       spinlock_t               l_lock;
+
  
         struct list_head         l_blocked_list;
         struct list_head         l_mask_waiters;
  
-       enum ocfs2_lock_type     l_type;
         unsigned long            l_flags;
         char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
-       int                      l_level;
         unsigned int             l_ro_holders;
         unsigned int             l_ex_holders;
-       struct ocfs2_dlm_lksb    l_lksb;
+       unsigned char            l_level;
+
+       /* Data packed - type enum ocfs2_lock_type */
+       unsigned char            l_type;
  
         /* used from AST/BAST funcs. */
-       enum ocfs2_ast_action    l_action;
-       enum ocfs2_unlock_action l_unlock_action;
-       int                      l_requested;
-       int                      l_blocking;
+       /* Data packed - enum type ocfs2_ast_action */
+       unsigned char            l_action;
+       /* Data packed - enum type ocfs2_unlock_action */
+       unsigned char            l_unlock_action;
+       unsigned char            l_requested;
+       unsigned char            l_blocking;
         unsigned int             l_pending_gen;
  
+       spinlock_t               l_lock;
+
+       struct ocfs2_dlm_lksb    l_lksb;
+
         wait_queue_head_t        l_event;
  
         struct list_head         l_debug_list;
@@ -256,8 +263,10 @@ enum ocfs2_mount_options
                                                    control lists */
         OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
         OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
-       OCFS2_MOUNT_HB_NONE = 1 << 12, /* No heartbeat */
-       OCFS2_MOUNT_HB_GLOBAL = 1 << 13, /* Global heartbeat */
+       OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+                                                    writes */
+       OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+       OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
  };
  
  #define OCFS2_OSB_SOFT_RO                      0x0001
@@ -279,7 +288,8 @@ struct ocfs2_super
         struct super_block *sb;
         struct inode *root_inode;
         struct inode *sys_root_inode;
-       struct inode *system_inodes[NUM_SYSTEM_INODES];
+       struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+       struct inode **local_system_inodes;
  
         struct ocfs2_slot_info *slot_info;
  
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h

index 28ff536..c2e4f82 100644 (file)
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -338,6 +338,7 @@ enum {
         USER_QUOTA_SYSTEM_INODE,
         GROUP_QUOTA_SYSTEM_INODE,
  #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
         ORPHAN_DIR_SYSTEM_INODE,
         EXTENT_ALLOC_SYSTEM_INODE,
         INODE_ALLOC_SYSTEM_INODE,
@@ -346,8 +347,12 @@ enum {
         TRUNCATE_LOG_SYSTEM_INODE,
         LOCAL_USER_QUOTA_SYSTEM_INODE,
         LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
         NUM_SYSTEM_INODES
  };
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE /* count */
+#define NUM_LOCAL_SYSTEM_INODES        \
+               (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
  
  static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
         /* Global system inodes (single copy) */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h

index 5d24150..b46f39b 100644 (file)
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
  };
  #define OCFS2_IOC_REFLINK      _IOW('o', 4, struct reflink_arguments)
  
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST         (50)
+#define OCFS2_TEXT_UUID_LEN            (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC               (0x4F32494E)
+
+/*
+ * Header passed to OCFS2_IOC_INFO.  Always try to separate info requests
+ * into small pieces to guarantee backward & forward compatibility.
+ */
+struct ocfs2_info {
+       __u64 oi_requests;      /* Array of __u64 pointers to requests */
+       __u32 oi_count;         /* Number of requests in oi_requests */
+       __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+       __u32 ir_code;  /* Info request code */
+       __u32 ir_size;  /* Size of request */
+       __u32 ir_flags; /* Request flags */
+/*10*/                 /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+       struct ocfs2_info_request ic_req;
+       __u32 ic_clustersize;
+       __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+       struct ocfs2_info_request ib_req;
+       __u32 ib_blocksize;
+       __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+       struct ocfs2_info_request im_req;
+       __u32 im_max_slots;
+       __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+       struct ocfs2_info_request il_req;
+       __u8    il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+       struct ocfs2_info_request iu_req;
+       __u8    iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+       struct ocfs2_info_request if_req;
+       __u32 if_compat_features;
+       __u32 if_incompat_features;
+       __u32 if_ro_compat_features;
+       __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+       struct ocfs2_info_request ij_req;
+       __u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+       OCFS2_INFO_CLUSTERSIZE = 1,
+       OCFS2_INFO_BLOCKSIZE,
+       OCFS2_INFO_MAXSLOTS,
+       OCFS2_INFO_LABEL,
+       OCFS2_INFO_UUID,
+       OCFS2_INFO_FS_FEATURES,
+       OCFS2_INFO_JOURNAL_SIZE,
+       OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT     (0x00000001)    /* Cluster coherency not
+                                                          required. This is a hint.
+                                                          It is up to ocfs2 whether
+                                                          the request can be fulfilled
+                                                          without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED           (0x40000000)    /* Filesystem understood
+                                                          this request and
+                                                          filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR            (0x80000000)    /* Error happened during
+                                                          request handling. */
+
+#define OCFS2_IOC_INFO         _IOR('o', 5, struct ocfs2_info)
+
  #endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c

index efdd756..b5f9160 100644 (file)
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
  
  struct ocfs2_cow_context {
         struct inode *inode;
+       struct file *file;
         u32 cow_start;
         u32 cow_len;
         struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
         u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
         struct page *page;
         pgoff_t page_index;
-       unsigned int from, to;
+       unsigned int from, to, readahead_pages;
         loff_t offset, end, map_end;
         struct address_space *mapping = context->inode->i_mapping;
  
         mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
              new_cluster, new_len, cpos);
  
+       readahead_pages =
+               (ocfs2_cow_contig_clusters(sb) <<
+                OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
         offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
         end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
         /*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
                 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
                         BUG_ON(PageDirty(page));
  
+               if (PageReadahead(page) && context->file) {
+                       page_cache_async_readahead(mapping,
+                                                  &context->file->f_ra,
+                                                  context->file,
+                                                  page, page_index,
+                                                  readahead_pages);
+               }
+
                 if (!PageUptodate(page)) {
                         ret = block_read_full_page(page, ocfs2_get_block);
                         if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
         return ret;
  }
  
+/* Start sync readahead on @file for @len clusters starting at @start. */
+static void ocfs2_readahead_for_cow(struct inode *inode, struct file *file,
+                                   u32 start, u32 len)
+{
+       struct address_space *mapping;
+       pgoff_t index;
+       unsigned long num_pages;
+       int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+
+       if (!file)      /* no file, hence no readahead state to use */
+               return;
+
+       mapping = file->f_mapping;
+       /* Widen len before shifting: the byte count may exceed 32 bits. */
+       num_pages = ((loff_t)len << cs_bits) >> PAGE_CACHE_SHIFT;
+       if (!num_pages)
+               num_pages = 1;
+
+       index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+       page_cache_sync_readahead(mapping, &file->f_ra, file, index, num_pages);
+}
+
  /*
   * Starting at cpos, try to CoW write_len clusters.  Don't CoW
   * past max_cpos.  This will stop when it runs into a hole or an
   * unrefcounted extent.
   */
  static int ocfs2_refcount_cow_hunk(struct inode *inode,
+                                  struct file *file,
                                    struct buffer_head *di_bh,
                                    u32 cpos, u32 write_len, u32 max_cpos)
  {
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
  
         BUG_ON(cow_len == 0);
  
+       ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+
         context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
         if (!context) {
                 ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
         context->ref_root_bh = ref_root_bh;
         context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
         context->get_clusters = ocfs2_di_get_clusters;
+       context->file = file;
  
         ocfs2_init_dinode_extent_tree(&context->data_et,
                                       INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
   * clusters between cpos and cpos+write_len are safe to modify.
   */
  int ocfs2_refcount_cow(struct inode *inode,
+                      struct file *file,
                        struct buffer_head *di_bh,
                        u32 cpos, u32 write_len, u32 max_cpos)
  {
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
                         num_clusters = write_len;
  
                 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
-                       ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+                       ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
                                                       num_clusters, max_cpos);
                         if (ret) {
                                 mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h

index 9983ba1..c8ce46f 100644 (file)
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
         struct rb_node rf_node;
         u64 rf_blkno;
         u32 rf_generation;
+       struct kref rf_getcnt;
         struct rw_semaphore rf_sem;
         struct ocfs2_lock_res rf_lockres;
-       struct kref rf_getcnt;
         int rf_removed;
  
         /* the following 4 fields are used by caching_info. */
-       struct ocfs2_caching_info rf_ci;
         spinlock_t rf_lock;
+       struct ocfs2_caching_info rf_ci;
         struct mutex rf_io_mutex;
         struct super_block *rf_sb;
  };
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
                                           u32 clusters,
                                           int *credits,
                                           int *ref_blocks);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_refcount_cow(struct inode *inode,
+                      struct file *filep, struct buffer_head *di_bh,
                        u32 cpos, u32 write_len, u32 max_cpos);
  
  typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c

index bfbd7e9..ab4e017 100644 (file)
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
  {
         int status = 0;
         u64 blkno;
-       unsigned long long blocks, bytes;
+       unsigned long long blocks, bytes = 0;
         unsigned int i;
         struct buffer_head *bh;
  
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c

index 849c2f0..5fed60d 100644 (file)
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
         }
  
         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+       if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+                           " count %u but claims %u are freed. num_bits %d",
+                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                           le16_to_cpu(bg->bg_bits),
+                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
+               return -EROFS;
+       }
         while(num_bits--)
                 ocfs2_set_bit(bit_off++, bitmap);
  
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
                                 (unsigned long *) undo_bg->bg_bitmap);
         }
         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+       if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+                           " count %u but claims %u are freed. num_bits %d",
+                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                           le16_to_cpu(bg->bg_bits),
+                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
+               return -EROFS;
+       }
  
         if (undo_fn)
                 jbd_unlock_bh_state(group_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c

index 4e009ad..a8a0ca4 100644 (file)
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -178,6 +178,8 @@ enum {
         Opt_noacl,
         Opt_usrquota,
         Opt_grpquota,
+       Opt_coherency_buffered,
+       Opt_coherency_full,
         Opt_resv_level,
         Opt_dir_resv_level,
         Opt_err,
@@ -207,6 +209,8 @@ static const match_table_t tokens = {
         {Opt_noacl, "noacl"},
         {Opt_usrquota, "usrquota"},
         {Opt_grpquota, "grpquota"},
+       {Opt_coherency_buffered, "coherency=buffered"},
+       {Opt_coherency_full, "coherency=full"},
         {Opt_resv_level, "resv_level=%u"},
         {Opt_dir_resv_level, "dir_resv_level=%u"},
         {Opt_err, NULL}
@@ -516,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
  
         mlog_entry_void();
  
-       for (i = 0; i < NUM_SYSTEM_INODES; i++) {
-               inode = osb->system_inodes[i];
+       for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+               inode = osb->global_system_inodes[i];
                 if (inode) {
                         iput(inode);
-                       osb->system_inodes[i] = NULL;
+                       osb->global_system_inodes[i] = NULL;
                 }
         }
  
@@ -536,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
                 osb->root_inode = NULL;
         }
  
+       if (!osb->local_system_inodes)
+               goto out;
+
+       for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+               if (osb->local_system_inodes[i]) {
+                       iput(osb->local_system_inodes[i]);
+                       osb->local_system_inodes[i] = NULL;
+               }
+       }
+
+       kfree(osb->local_system_inodes);
+       osb->local_system_inodes = NULL;
+
+out:
         mlog_exit(0);
  }
  
@@ -1452,6 +1470,12 @@ static int ocfs2_parse_options(struct super_block *sb,
                 case Opt_grpquota:
                         mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
                         break;
+               case Opt_coherency_buffered:
+                       mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+                       break;
+               case Opt_coherency_full:
+                       mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+                       break;
                 case Opt_acl:
                         mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
                         mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1563,6 +1587,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
         if (opts & OCFS2_MOUNT_GRPQUOTA)
                 seq_printf(s, ",grpquota");
  
+       if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+               seq_printf(s, ",coherency=buffered");
+       else
+               seq_printf(s, ",coherency=full");
+
         if (opts & OCFS2_MOUNT_NOUSERXATTR)
                 seq_printf(s, ",nouser_xattr");
         else
@@ -2017,6 +2046,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
         return 0;
  }
  
+/* Make sure the entire volume is addressable by our journal.  Requires
+   osb_clusters_at_boot to be valid and the journal to have been
+   initialized by ocfs2_journal_init().  Returns 0 or -EFBIG. */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+       int status = 0;
+       u64 max_block =
+               ocfs2_clusters_to_blocks(osb->sb,
+                                        osb->osb_clusters_at_boot) - 1;
+
+       /* 32-bit block number is always OK. */
+       if (max_block <= (u32)~0ULL)
+               goto out;
+
+       /* Volume is "huge", so see if our journal is new enough to
+          support it. */
+       if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+                                      OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+             jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+                                              JBD2_FEATURE_INCOMPAT_64BIT))) {
+               mlog(ML_ERROR, "The journal cannot address the entire volume. "
+                    "Enable the 'block64' journal option with tunefs.ocfs2\n");
+               status = -EFBIG;
+               goto out;
+       }
+
+ out:
+       return status;
+}
+
  static int ocfs2_initialize_super(struct super_block *sb,
                                   struct buffer_head *bh,
                                   int sector_size,
@@ -2029,6 +2088,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
         struct ocfs2_journal *journal;
         __le32 uuid_net_key;
         struct ocfs2_super *osb;
+       u64 total_blocks;
  
         mlog_entry_void();
  
@@ -2087,6 +2147,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
         snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
                  MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
  
+       osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+       if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+               mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+                    osb->max_slots);
+               status = -EINVAL;
+               goto bail;
+       }
+       mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+
         ocfs2_orphan_scan_init(osb);
  
         status = ocfs2_recovery_init(osb);
@@ -2125,15 +2194,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
                 goto bail;
         }
  
-       osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
-       if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
-               mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
-                    osb->max_slots);
-               status = -EINVAL;
-               goto bail;
-       }
-       mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
         osb->slot_recovery_generations =
                 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
                         GFP_KERNEL);
@@ -2243,11 +2303,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
                 goto bail;
         }
  
-       if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
-           > (u32)~0UL) {
-               mlog(ML_ERROR, "Volume might try to write to blocks beyond "
-                    "what jbd can address in 32 bits.\n");
-               status = -EINVAL;
+       total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+                                               le32_to_cpu(di->i_clusters));
+
+       status = generic_check_addressable(osb->sb->s_blocksize_bits,
+                                          total_blocks);
+       if (status) {
+               mlog(ML_ERROR, "Volume too large "
+                    "to mount safely on this system");
+               status = -EFBIG;
                 goto bail;
         }
  
@@ -2409,6 +2473,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
                 goto finally;
         }
  
+       /* Now that journal has been initialized, check to make sure
+          entire volume is addressable. */
+       status = ocfs2_journal_addressable(osb);
+       if (status)
+               goto finally;
+
         /* If the journal was unmounted cleanly then we don't want to
          * recover anything. Otherwise, journal_load will do that
          * dirty work for us :) */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c

index bfe7190..902efb2 100644 (file)
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
                                                    int type,
                                                    u32 slot);
  
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-                                          int type,
-                                          u32 slot);
-
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
  #endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
                 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
  }
  
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
-                                          int type,
-                                          u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+                                            int type,
+                                            u32 slot)
  {
-       return slot == osb->slot_num || is_global_system_inode(type);
+       int index;
+       struct inode **local_system_inodes, **free = NULL;
+
+       BUG_ON(slot == OCFS2_INVALID_SLOT);
+       BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+              type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+       spin_lock(&osb->osb_lock);
+       local_system_inodes = osb->local_system_inodes;
+       spin_unlock(&osb->osb_lock);
+
+       if (unlikely(!local_system_inodes)) {   /* not allocated yet */
+               local_system_inodes = kzalloc(sizeof(struct inode *) *
+                                             NUM_LOCAL_SYSTEM_INODES *
+                                             osb->max_slots,
+                                             GFP_NOFS);
+               if (!local_system_inodes) {
+                       mlog_errno(-ENOMEM);
+                       /*
+                        * Return NULL here so that ocfs2_get_system_file_inode
+                        * will try to create an inode and use it. We will try
+                        * to initialize local_system_inodes next time.
+                        */
+                       return NULL;
+               }
+
+               spin_lock(&osb->osb_lock);
+               if (osb->local_system_inodes) {
+                       /* Someone has initialized it for us. */
+                       free = local_system_inodes;
+                       local_system_inodes = osb->local_system_inodes;
+               } else
+                       osb->local_system_inodes = local_system_inodes;
+               spin_unlock(&osb->osb_lock);
+               if (unlikely(free))     /* lost the race: drop our copy */
+                       kfree(free);
+       }
+
+       /* One run of NUM_LOCAL_SYSTEM_INODES entries per slot. */
+       index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+               (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+       return &local_system_inodes[index];
  }
  
  struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
         struct inode **arr = NULL;
  
         /* avoid the lookup if cached in local system file array */
-       if (is_in_system_inode_array(osb, type, slot))
-               arr = &(osb->system_inodes[type]);
+       if (is_global_system_inode(type)) {
+               arr = &(osb->global_system_inodes[type]);
+       } else
+               arr = get_local_system_inode(osb, type, slot);
  
         if (arr && ((inode = *arr) != NULL)) {
                 /* get a ref in addition to the array ref */
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 63d069b..ae527be 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2378,6 +2378,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
  
  extern int generic_file_fsync(struct file *, int);
  
+extern int generic_check_addressable(unsigned, u64);
+
  #ifdef CONFIG_MIGRATION
  extern int buffer_migrate_page(struct address_space *,
                                 struct page *, struct page *);
author	Joel Becker <joel.becker@oracle.com>
	Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
committer	Joel Becker <joel.becker@oracle.com>
	Fri, 15 Oct 2010 20:03:09 +0000 (13:03 -0700)
Documentation/filesystems/ocfs2.txt		patch \| blob \| history
fs/ext3/super.c		patch \| blob \| history
fs/ext4/super.c		patch \| blob \| history
fs/jbd2/journal.c		patch \| blob \| history
fs/libfs.c		patch \| blob \| history
fs/ocfs2/aops.c		patch \| blob \| history
fs/ocfs2/aops.h		patch \| blob \| history
fs/ocfs2/dcache.c		patch \| blob \| history
fs/ocfs2/dcache.h		patch \| blob \| history
fs/ocfs2/dlm/dlmdebug.c		patch \| blob \| history
fs/ocfs2/dlmglue.c		patch \| blob \| history
fs/ocfs2/file.c		patch \| blob \| history
fs/ocfs2/inode.c		patch \| blob \| history
fs/ocfs2/inode.h		patch \| blob \| history
fs/ocfs2/ioctl.c		patch \| blob \| history
fs/ocfs2/journal.c		patch \| blob \| history
fs/ocfs2/journal.h		patch \| blob \| history
fs/ocfs2/mmap.c		patch \| blob \| history
fs/ocfs2/namei.c		patch \| blob \| history
fs/ocfs2/ocfs2.h		patch \| blob \| history
fs/ocfs2/ocfs2_fs.h		patch \| blob \| history
fs/ocfs2/ocfs2_ioctl.h		patch \| blob \| history
fs/ocfs2/refcounttree.c		patch \| blob \| history
fs/ocfs2/refcounttree.h		patch \| blob \| history
fs/ocfs2/slot_map.c		patch \| blob \| history
fs/ocfs2/suballoc.c		patch \| blob \| history
fs/ocfs2/super.c		patch \| blob \| history
fs/ocfs2/sysfile.c		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history