ext4: add fsync batch tuning knobs
authorTheodore Ts'o <tytso@mit.edu>
Sun, 4 Jan 2009 01:27:38 +0000 (20:27 -0500)
committerTheodore Ts'o <tytso@mit.edu>
Sun, 4 Jan 2009 01:27:38 +0000 (20:27 -0500)
Add new mount options, min_batch_time and max_batch_time, which
control how long the jbd2 layer should wait for additional filesystem
operations to get batched with a synchronous write transaction.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Documentation/filesystems/ext4.txt
fs/ext4/ext4.h
fs/ext4/ext4_sb.h
fs/ext4/super.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
include/linux/jbd2.h

index f75ab10..e3fcbea 100644 (file)
@@ -283,6 +283,35 @@ delalloc   (*)     Deferring block allocation until write-out time.
 nodelalloc             Disable delayed allocation. Blocks are allocation
                        when data is copied from user to page cache.
 
+max_batch_time=usec    Maximum amount of time ext4 should wait for
+                       additional filesystem operations to be batched
+                       together with a synchronous write operation.
+                       Since a synchronous write operation is going to
+                       force a commit and then wait for the I/O to
+                       complete, it doesn't cost much, and can be a
+                       huge throughput win, we wait for a small amount
+                       of time to see if any other transactions can
+                       piggyback on the synchronous write.   The
+                       algorithm used is designed to automatically tune
+                       for the speed of the disk, by measuring the
+                       amount of time (on average) that it takes to
+                       finish committing a transaction.  Call this time
+                       the "commit time".  If the time that the
+                       transaction has been running is less than the
+                       commit time, ext4 will try sleeping for the
+                       commit time to see if other operations will join
+                       the transaction.   The commit time is capped by
+                       the max_batch_time, which defaults to 15000us
+                       (15ms).   This optimization can be turned off
+                       entirely by setting max_batch_time to 0.
+
+min_batch_time=usec    This parameter sets the commit time (as
+                       described above) to be at least min_batch_time.
+                       It defaults to zero microseconds.  Increasing
+                       this parameter may improve the throughput of
+                       multi-threaded, synchronous workloads on very
+                       fast disks, at the cost of increasing latency.
+
 Data Mode
 =========
 There are 3 different data modes:
index ac8551e..9ba9fd6 100644 (file)
@@ -328,6 +328,7 @@ struct ext4_mount_options {
        uid_t s_resuid;
        gid_t s_resgid;
        unsigned long s_commit_interval;
+       u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
        int s_jquota_fmt;
        char *s_qf_names[MAXQUOTAS];
@@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_ORDERED        0x0040
 #define EXT4_DEFM_JMODE_WBACK  0x0060
 
+/*
+ * Default journal batch times, in microseconds
+ */
+#define EXT4_DEF_MIN_BATCH_TIME        0
+#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */
+
 /*
  * Structure of a directory entry
  */
index 3db800f..039b6ea 100644 (file)
@@ -74,6 +74,8 @@ struct ext4_sb_info {
        struct journal_s *s_journal;
        struct list_head s_orphan;
        unsigned long s_commit_interval;
+       u32 s_max_batch_time;
+       u32 s_min_batch_time;
        struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
        struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
index dc27d4c..da377f9 100644 (file)
@@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
        if (!test_opt(sb, RESERVATION))
                seq_puts(seq, ",noreservation");
-       if (sbi->s_commit_interval) {
+       if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                seq_printf(seq, ",commit=%u",
                           (unsigned) (sbi->s_commit_interval / HZ));
        }
+       if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+               seq_printf(seq, ",min_batch_time=%u",
+                          (unsigned) sbi->s_min_batch_time);
+       }
+       if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+               seq_printf(seq, ",max_batch_time=%u",
+                          (unsigned) sbi->s_max_batch_time);
+       }
+
        /*
         * We're changing the default of barrier mount option, so
         * let's always display its mount state so it's clear what its
@@ -874,7 +883,8 @@ enum {
        Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-       Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+       Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+       Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore,
@@ -913,6 +923,8 @@ static const match_table_t tokens = {
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
        {Opt_commit, "commit=%u"},
+       {Opt_min_batch_time, "min_batch_time=%u"},
+       {Opt_max_batch_time, "max_batch_time=%u"},
        {Opt_journal_update, "journal=update"},
        {Opt_journal_inum, "journal=%u"},
        {Opt_journal_dev, "journal_dev=%u"},
@@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
                                option = JBD2_DEFAULT_MAX_COMMIT_AGE;
                        sbi->s_commit_interval = HZ * option;
                        break;
+               case Opt_max_batch_time:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0)
+                               return 0;
+                       if (option == 0)
+                               option = EXT4_DEF_MAX_BATCH_TIME;
+                       sbi->s_max_batch_time = option;
+                       break;
+               case Opt_min_batch_time:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_min_batch_time = option;
+                       break;
                case Opt_data_journal:
                        data_opt = EXT4_MOUNT_JOURNAL_DATA;
                        goto datacheck;
@@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+       sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+       sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+       sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
        set_opt(sbi->s_mount_opt, RESERVATION);
        set_opt(sbi->s_mount_opt, BARRIER);
@@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (sbi->s_commit_interval)
-               journal->j_commit_interval = sbi->s_commit_interval;
-       /* We could also set up an ext4-specific default for the commit
-        * interval here, but for now we'll just fall back to the jbd
-        * default. */
+       journal->j_commit_interval = sbi->s_commit_interval;
+       journal->j_min_batch_time = sbi->s_min_batch_time;
+       journal->j_max_batch_time = sbi->s_max_batch_time;
 
        spin_lock(&journal->j_state_lock);
        if (test_opt(sb, BARRIER))
@@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
+       old_opts.s_min_batch_time = sbi->s_min_batch_time;
+       old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
        for (i = 0; i < MAXQUOTAS; i++)
@@ -3178,6 +3209,8 @@ restore_opts:
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
+       sbi->s_min_batch_time = old_opts.s_min_batch_time;
+       sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
        for (i = 0; i < MAXQUOTAS; i++) {
index 74d8729..fd1d755 100644 (file)
@@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
        spin_lock_init(&journal->j_state_lock);
 
        journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+       journal->j_min_batch_time = 0;
+       journal->j_max_batch_time = 15000; /* 15ms */
 
        /* The journal is marked for error until we succeed with recovery! */
        journal->j_flags = JBD2_ABORT;
index 13dcbc9..48c21ba 100644 (file)
@@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                   transaction->t_start_time));
 
+               commit_time = max_t(u64, commit_time,
+                                   1000ULL*journal->j_min_batch_time);
                commit_time = min_t(u64, commit_time,
-                                   1000*jiffies_to_usecs(1));
+                                   1000ULL*journal->j_max_batch_time);
 
                if (trans_time < commit_time) {
                        ktime_t expires = ktime_add_ns(ktime_get(),
index ab8cef1..a3cd647 100644 (file)
@@ -956,6 +956,14 @@ struct journal_s
         */
        u64                     j_average_commit_time;
 
+       /*
+        * minimum and maximum times that we should wait for
+        * additional filesystem operations to get batched into a
+        * synchronous handle in microseconds
+        */
+       u32                     j_min_batch_time;
+       u32                     j_max_batch_time;
+
        /* This function is called when a transaction is closed */
        void                    (*j_commit_callback)(journal_t *,
                                                     transaction_t *);