ext4: fix waiting and sending of a barrier in ext4_sync_file()

[pandora-kernel.git] / fs / ext4 / super.c
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 22546ad..874dd25 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb);
  static int ext4_freeze(struct super_block *sb);
  static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                        const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
  static int ext4_feature_set_ok(struct super_block *sb, int readonly);
  static void ext4_destroy_lazyinit_thread(void);
  static void ext4_unregister_li_request(struct super_block *sb);
  static void ext4_clear_request_list(void);
  
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "ext2",
+       .mount          = ext4_mount,
+       .kill_sb        = kill_block_super,
+       .fs_flags       = FS_REQUIRES_DEV,
+};
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
  #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
  static struct file_system_type ext3_fs_type = {
         .owner          = THIS_MODULE,
@@ -242,27 +258,44 @@ static void ext4_put_nojournal(handle_t *handle)
   * journal_end calls result in the superblock being marked dirty, so
   * that sync() will call the filesystem's write_super callback if
   * appropriate.
+ *
+ * To avoid holding j_barrier while in userspace when a user calls
+ * freeze(), ext4 prevents a new handle from being started by means of
+ * s_frozen, which lives in an upper layer.
   */
  handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
  {
         journal_t *journal;
+       handle_t  *handle;
  
         if (sb->s_flags & MS_RDONLY)
                 return ERR_PTR(-EROFS);
  
-       vfs_check_frozen(sb, SB_FREEZE_TRANS);
-       /* Special case here: if the journal has aborted behind our
-        * backs (eg. EIO in the commit thread), then we still need to
-        * take the FS itself readonly cleanly. */
         journal = EXT4_SB(sb)->s_journal;
-       if (journal) {
-               if (is_journal_aborted(journal)) {
-                       ext4_abort(sb, "Detected aborted journal");
-                       return ERR_PTR(-EROFS);
-               }
-               return jbd2_journal_start(journal, nblocks);
+       handle = ext4_journal_current_handle();
+
+       /*
+        * If a handle has been started, it should be allowed to
+        * finish, otherwise deadlock could happen between freeze
+        * and others (e.g. truncate) due to the restart of the
+        * journal handle if the filesystem is frozen and active
+        * handles are not stopped.
+        */
+       if (!handle)
+               vfs_check_frozen(sb, SB_FREEZE_TRANS);
+
+       if (!journal)
+               return ext4_get_nojournal();
+       /*
+        * Special case here: if the journal has aborted behind our
+        * backs (eg. EIO in the commit thread), then we still need to
+        * take the FS itself readonly cleanly.
+        */
+       if (is_journal_aborted(journal)) {
+               ext4_abort(sb, "Detected aborted journal");
+               return ERR_PTR(-EROFS);
         }
-       return ext4_get_nojournal();
+       return jbd2_journal_start(journal, nblocks);
  }
  
  /*
@@ -617,7 +650,7 @@ __acquires(bitlock)
          * filesystem will have already been marked read/only and the
          * journal has been aborted.  We return 1 as a hint to callers
          * who might what to use the return value from
-        * ext4_grp_locked_error() to distinguish beween the
+        * ext4_grp_locked_error() to distinguish between the
          * ERRORS_CONT and ERRORS_RO case, and perhaps return more
          * aggressively from the ext4 function in question, with a
          * more appropriate error code.
@@ -1079,7 +1112,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
  
         if (!test_opt(sb, INIT_INODE_TABLE))
                 seq_puts(seq, ",noinit_inode_table");
-       else if (sbi->s_li_wait_mult)
+       else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
                 seq_printf(seq, ",init_inode_table=%u",
                            (unsigned) sbi->s_li_wait_mult);
  
@@ -1170,9 +1203,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                 const char *data, size_t len, loff_t off);
  
  static const struct dquot_operations ext4_quota_operations = {
-#ifdef CONFIG_QUOTA
         .get_reserved_space = ext4_get_reserved_space,
-#endif
         .write_dquot    = ext4_write_dquot,
         .acquire_dquot  = ext4_acquire_dquot,
         .release_dquot  = ext4_release_dquot,
@@ -1883,7 +1914,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                 ext4_msg(sb, KERN_WARNING,
                          "warning: mounting fs with errors, "
                          "running e2fsck is recommended");
-       else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+       else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                  le16_to_cpu(es->s_mnt_count) >=
                  (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
                 ext4_msg(sb, KERN_WARNING,
@@ -2408,6 +2439,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
                           EXT4_SB(sb)->s_sectors_written_start) >> 1)));
  }
  
+static ssize_t extent_cache_hits_show(struct ext4_attr *a,
+                                     struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
+}
+
+static ssize_t extent_cache_misses_show(struct ext4_attr *a,
+                                       struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
+}
+
  static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
                                           struct ext4_sb_info *sbi,
                                           const char *buf, size_t count)
@@ -2465,6 +2508,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
  EXT4_RO_ATTR(delayed_allocation_blocks);
  EXT4_RO_ATTR(session_write_kbytes);
  EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(extent_cache_hits);
+EXT4_RO_ATTR(extent_cache_misses);
  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                  inode_readahead_blks_store, s_inode_readahead_blks);
  EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2480,6 +2525,8 @@ static struct attribute *ext4_attrs[] = {
         ATTR_LIST(delayed_allocation_blocks),
         ATTR_LIST(session_write_kbytes),
         ATTR_LIST(lifetime_write_kbytes),
+       ATTR_LIST(extent_cache_hits),
+       ATTR_LIST(extent_cache_misses),
         ATTR_LIST(inode_readahead_blks),
         ATTR_LIST(inode_goal),
         ATTR_LIST(mb_stats),
@@ -2642,12 +2689,6 @@ static void print_daily_error_info(unsigned long arg)
         mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
  }
  
-static void ext4_lazyinode_timeout(unsigned long data)
-{
-       struct task_struct *p = (struct task_struct *)data;
-       wake_up_process(p);
-}
-
  /* Find next suitable group and run ext4_init_inode_table */
  static int ext4_run_li_request(struct ext4_li_request *elr)
  {
@@ -2679,11 +2720,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
                 ret = ext4_init_inode_table(sb, group,
                                             elr->lr_timeout ? 0 : 1);
                 if (elr->lr_timeout == 0) {
-                       timeout = jiffies - timeout;
-                       if (elr->lr_sbi->s_li_wait_mult)
-                               timeout *= elr->lr_sbi->s_li_wait_mult;
-                       else
-                               timeout *= 20;
+                       timeout = (jiffies - timeout) *
+                                 elr->lr_sbi->s_li_wait_mult;
                         elr->lr_timeout = timeout;
                 }
                 elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2695,7 +2733,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
  
  /*
   * Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
   */
  static void ext4_remove_li_request(struct ext4_li_request *elr)
  {
@@ -2713,14 +2751,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
  
  static void ext4_unregister_li_request(struct super_block *sb)
  {
-       struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
-       if (!ext4_li_info)
+       mutex_lock(&ext4_li_mtx);
+       if (!ext4_li_info) {
+               mutex_unlock(&ext4_li_mtx);
                 return;
+       }
  
         mutex_lock(&ext4_li_info->li_list_mtx);
-       ext4_remove_li_request(elr);
+       ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
         mutex_unlock(&ext4_li_info->li_list_mtx);
+       mutex_unlock(&ext4_li_mtx);
  }
  
  static struct task_struct *ext4_lazyinit_task;
@@ -2739,17 +2779,10 @@ static int ext4_lazyinit_thread(void *arg)
         struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
         struct list_head *pos, *n;
         struct ext4_li_request *elr;
-       unsigned long next_wakeup;
-       DEFINE_WAIT(wait);
+       unsigned long next_wakeup, cur;
  
         BUG_ON(NULL == eli);
  
-       eli->li_timer.data = (unsigned long)current;
-       eli->li_timer.function = ext4_lazyinode_timeout;
-
-       eli->li_task = current;
-       wake_up(&eli->li_wait_task);
-
  cont_thread:
         while (true) {
                 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2780,19 +2813,15 @@ cont_thread:
                 if (freezing(current))
                         refrigerator();
  
-               if ((time_after_eq(jiffies, next_wakeup)) ||
+               cur = jiffies;
+               if ((time_after_eq(cur, next_wakeup)) ||
                     (MAX_JIFFY_OFFSET == next_wakeup)) {
                         cond_resched();
                         continue;
                 }
  
-               eli->li_timer.expires = next_wakeup;
-               add_timer(&eli->li_timer);
-               prepare_to_wait(&eli->li_wait_daemon, &wait,
-                               TASK_INTERRUPTIBLE);
-               if (time_before(jiffies, next_wakeup))
-                       schedule();
-               finish_wait(&eli->li_wait_daemon, &wait);
+               schedule_timeout_interruptible(next_wakeup - cur);
+
                 if (kthread_should_stop()) {
                         ext4_clear_request_list();
                         goto exit_thread;
@@ -2816,12 +2845,7 @@ exit_thread:
                 goto cont_thread;
         }
         mutex_unlock(&eli->li_list_mtx);
-       del_timer_sync(&ext4_li_info->li_timer);
-       eli->li_task = NULL;
-       wake_up(&eli->li_wait_task);
-
         kfree(ext4_li_info);
-       ext4_lazyinit_task = NULL;
         ext4_li_info = NULL;
         mutex_unlock(&ext4_li_mtx);
  
@@ -2849,7 +2873,6 @@ static int ext4_run_lazyinit_thread(void)
         if (IS_ERR(ext4_lazyinit_task)) {
                 int err = PTR_ERR(ext4_lazyinit_task);
                 ext4_clear_request_list();
-               del_timer_sync(&ext4_li_info->li_timer);
                 kfree(ext4_li_info);
                 ext4_li_info = NULL;
                 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2858,8 +2881,6 @@ static int ext4_run_lazyinit_thread(void)
                 return err;
         }
         ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
-       wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
         return 0;
  }
  
@@ -2894,13 +2915,9 @@ static int ext4_li_info_new(void)
         if (!eli)
                 return -ENOMEM;
  
-       eli->li_task = NULL;
         INIT_LIST_HEAD(&eli->li_request_list);
         mutex_init(&eli->li_list_mtx);
  
-       init_waitqueue_head(&eli->li_wait_daemon);
-       init_waitqueue_head(&eli->li_wait_task);
-       init_timer(&eli->li_timer);
         eli->li_state |= EXT4_LAZYINIT_QUIT;
  
         ext4_li_info = eli;
@@ -2943,20 +2960,19 @@ static int ext4_register_li_request(struct super_block *sb,
         ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
         int ret = 0;
  
-       if (sbi->s_li_request != NULL)
+       if (sbi->s_li_request != NULL) {
+               /*
+                * Reset timeout so it can be computed again, because
+                * s_li_wait_mult might have changed.
+                */
+               sbi->s_li_request->lr_timeout = 0;
                 return 0;
+       }
  
         if (first_not_zeroed == ngroups ||
             (sb->s_flags & MS_RDONLY) ||
-           !test_opt(sb, INIT_INODE_TABLE)) {
-               sbi->s_li_request = NULL;
-               return 0;
-       }
-
-       if (first_not_zeroed == ngroups) {
-               sbi->s_li_request = NULL;
+           !test_opt(sb, INIT_INODE_TABLE))
                 return 0;
-       }
  
         elr = ext4_li_request_new(sb, first_not_zeroed);
         if (!elr)
@@ -2975,6 +2991,12 @@ static int ext4_register_li_request(struct super_block *sb,
         mutex_unlock(&ext4_li_info->li_list_mtx);
  
         sbi->s_li_request = elr;
+       /*
+        * set elr to NULL here since it has been inserted to
+        * the request_list and the removal and free of it is
+        * handled by ext4_clear_request_list from now on.
+        */
+       elr = NULL;
  
         if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
                 ret = ext4_run_lazyinit_thread();
@@ -3143,6 +3165,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
             ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                 set_opt(sb, DELALLOC);
  
+       /*
+        * set default s_li_wait_mult for lazyinit, for the case there is
+        * no mount option specified.
+        */
+       sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
         if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
                            &journal_devnum, &journal_ioprio, NULL, 0)) {
                 ext4_msg(sb, KERN_WARNING,
@@ -3164,6 +3192,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        "feature flags set on rev 0 fs, "
                        "running e2fsck is recommended");
  
+       if (IS_EXT2_SB(sb)) {
+               if (ext2_feature_set_ok(sb))
+                       ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+                                "using the ext4 subsystem");
+               else {
+                       ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+                                "to feature incompatibilities");
+                       goto failed_mount;
+               }
+       }
+
+       if (IS_EXT3_SB(sb)) {
+               if (ext3_feature_set_ok(sb))
+                       ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+                                "using the ext4 subsystem");
+               else {
+                       ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+                                "to feature incompatibilities");
+                       goto failed_mount;
+               }
+       }
+
         /*
          * Check feature flags regardless of the revision level, since we
          * previously didn't change the revision level when setting the flags,
@@ -3385,6 +3435,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         get_random_bytes(&sbi->s_next_generation, sizeof(u32));
         spin_lock_init(&sbi->s_next_gen_lock);
  
+       init_timer(&sbi->s_err_report);
+       sbi->s_err_report.function = print_daily_error_info;
+       sbi->s_err_report.data = (unsigned long) sb;
+
         err = percpu_counter_init(&sbi->s_freeblocks_counter,
                         ext4_count_free_blocks(sb));
         if (!err) {
@@ -3447,7 +3501,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 goto failed_mount_wq;
         } else {
                 clear_opt(sb, DATA_FLAGS);
-               set_opt(sb, WRITEBACK_DATA);
                 sbi->s_journal = NULL;
                 needs_recovery = 0;
                 goto no_journal;
@@ -3646,9 +3699,6 @@ no_journal:
                  "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
                  *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
  
-       init_timer(&sbi->s_err_report);
-       sbi->s_err_report.function = print_daily_error_info;
-       sbi->s_err_report.data = (unsigned long) sb;
         if (es->s_error_count)
                 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
  
@@ -3672,6 +3722,7 @@ failed_mount_wq:
                 sbi->s_journal = NULL;
         }
  failed_mount3:
+       del_timer(&sbi->s_err_report);
         if (sbi->s_flex_groups) {
                 if (is_vmalloc_addr(sbi->s_flex_groups))
                         vfree(sbi->s_flex_groups);
@@ -4138,6 +4189,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
  /*
   * LVM calls this function before a (read-only) snapshot is created.  This
   * gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that this function alone cannot bring a filesystem into a clean
+ * state, because ext4 prevents a new handle from being started by means of
+ * @sb->s_frozen, which lives in an upper layer.  It thus needs help from
+ * the upper layer.
   */
  static int ext4_freeze(struct super_block *sb)
  {
@@ -4614,17 +4670,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
  
  static int ext4_quota_off(struct super_block *sb, int type)
  {
+       struct inode *inode = sb_dqopt(sb)->files[type];
+       handle_t *handle;
+
         /* Force all delayed allocation blocks to be allocated.
          * Caller already holds s_umount sem */
         if (test_opt(sb, DELALLOC))
                 sync_filesystem(sb);
  
+       if (!inode)
+               goto out;
+
+       /* Update modification times of quota files when userspace can
+        * start looking at them */
+       handle = ext4_journal_start(inode, 1);
+       if (IS_ERR(handle))
+               goto out;
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       ext4_mark_inode_dirty(handle, inode);
+       ext4_journal_stop(handle);
+
+out:
         return dquot_quota_off(sb, type);
  }
  
  /* Read data from quotafile - avoid pagecache and such because we cannot afford
   * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
   * we don't have to be afraid of races */
  static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                                size_t len, loff_t off)
@@ -4714,9 +4786,8 @@ out:
         if (inode->i_size < off + len) {
                 i_size_write(inode, off + len);
                 EXT4_I(inode)->i_disksize = inode->i_size;
+               ext4_mark_inode_dirty(handle, inode);
         }
-       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-       ext4_mark_inode_dirty(handle, inode);
         mutex_unlock(&inode->i_mutex);
         return len;
  }
@@ -4730,14 +4801,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
  }
  
  #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
-       .owner          = THIS_MODULE,
-       .name           = "ext2",
-       .mount          = ext4_mount,
-       .kill_sb        = kill_block_super,
-       .fs_flags       = FS_REQUIRES_DEV,
-};
-
  static inline void register_as_ext2(void)
  {
         int err = register_filesystem(&ext2_fs_type);
@@ -4750,10 +4813,22 @@ static inline void unregister_as_ext2(void)
  {
         unregister_filesystem(&ext2_fs_type);
  }
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+               return 0;
+       if (sb->s_flags & MS_RDONLY)
+               return 1;
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+               return 0;
+       return 1;
+}
  MODULE_ALIAS("ext2");
  #else
  static inline void register_as_ext2(void) { }
  static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
  #endif
  
  #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4769,10 +4844,24 @@ static inline void unregister_as_ext3(void)
  {
         unregister_filesystem(&ext3_fs_type);
  }
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+               return 0;
+       if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+               return 0;
+       if (sb->s_flags & MS_RDONLY)
+               return 1;
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+               return 0;
+       return 1;
+}
  MODULE_ALIAS("ext3");
  #else
  static inline void register_as_ext3(void) { }
  static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
  #endif
  
  static struct file_system_type ext4_fs_type = {
@@ -4856,8 +4945,8 @@ static int __init ext4_init_fs(void)
         err = init_inodecache();
         if (err)
                 goto out1;
-       register_as_ext2();
         register_as_ext3();
+       register_as_ext2();
         err = register_filesystem(&ext4_fs_type);
         if (err)
                 goto out;