ext4: Fix bigalloc quota accounting and i_blocks value
[pandora-kernel.git] / fs / ext4 / super.c
index e2d88ba..66b8cfa 100644 (file)
@@ -124,7 +124,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
 {
        void *ret;
 
-       ret = kmalloc(size, flags);
+       ret = kzalloc(size, flags);
        if (!ret)
                ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
        return ret;
@@ -414,6 +414,22 @@ static void save_error_info(struct super_block *sb, const char *func,
        ext4_commit_super(sb, 1);
 }
 
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else.  Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super) will cause a kernel OOPS.
+ * This is a kludge to prevent these oopses until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+       struct inode *bd_inode = sb->s_bdev->bd_inode;
+       struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+
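+       /* bdi_unregister() sets bdi->dev to NULL, so a NULL dev is used here
+        * as the sign that the underlying device has already gone away. */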
+       return bdi->dev == NULL;
+}
+
 
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
@@ -821,10 +837,10 @@ static void ext4_put_super(struct super_block *sb)
                brelse(sbi->s_group_desc[i]);
        ext4_kvfree(sbi->s_group_desc);
        ext4_kvfree(sbi->s_flex_groups);
-       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
-       percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+       percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < MAXQUOTAS; i++)
@@ -919,7 +935,6 @@ static void ext4_i_callback(struct rcu_head *head)
 
 static void ext4_destroy_inode(struct inode *inode)
 {
-       ext4_ioend_wait(inode);
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
@@ -1802,6 +1817,7 @@ set_qf_format:
                        break;
                case Opt_nodelalloc:
                        clear_opt(sb, DELALLOC);
+                       clear_opt2(sb, EXPLICIT_DELALLOC);
                        break;
                case Opt_mblk_io_submit:
                        set_opt(sb, MBLK_IO_SUBMIT);
@@ -1818,6 +1834,7 @@ set_qf_format:
                        break;
                case Opt_delalloc:
                        set_opt(sb, DELALLOC);
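+                       /* remember that delalloc was requested explicitly, so
+                        * the data=journal check can refuse the mount rather
+                        * than silently dropping the option */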
+                       set_opt2(sb, EXPLICIT_DELALLOC);
                        break;
                case Opt_block_validity:
                        set_opt(sb, BLOCK_VALIDITY);
@@ -1936,7 +1953,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                res = MS_RDONLY;
        }
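+       /* for read-only mounts skip the state checks and superblock commit,
+        * but still fall through to the debug printout at "done" */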
        if (read_only)
-               return res;
+               goto done;
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
                         "running e2fsck is recommended");
@@ -1967,6 +1984,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
        ext4_commit_super(sb, 1);
+done:
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
@@ -2017,7 +2035,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
                atomic_add(ext4_free_inodes_count(sb, gdp),
                           &sbi->s_flex_groups[flex_group].free_inodes);
                atomic_add(ext4_free_blks_count(sb, gdp),
-                          &sbi->s_flex_groups[flex_group].free_blocks);
+                          &sbi->s_flex_groups[flex_group].free_clusters);
                atomic_add(ext4_used_dirs_count(sb, gdp),
                           &sbi->s_flex_groups[flex_group].used_dirs);
        }
@@ -2455,7 +2473,8 @@ static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
                                              char *buf)
 {
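+       /* the counter is kept in clusters; report blocks so the sysfs
+        * interface keeps its old unit (EXT4_C2B shifts by s_cluster_bits) */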
        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                       (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+               (s64) EXT4_C2B(sbi,
+                       percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 }
 
 static ssize_t session_write_kbytes_show(struct ext4_attr *a,
@@ -2683,6 +2702,13 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
                        return 0;
                }
        }
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+           !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+               ext4_msg(sb, KERN_ERR,
+                        "Can't support bigalloc feature without "
+                        "extents feature");
+               return 0;
+       }
        return 1;
 }
 
@@ -3088,10 +3114,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        char *cp;
        const char *descr;
        int ret = -ENOMEM;
-       int blocksize;
+       int blocksize, clustersize;
        unsigned int db_count;
        unsigned int i;
-       int needs_recovery, has_huge_files;
+       int needs_recovery, has_huge_files, has_bigalloc;
        __u64 blocks_count;
        int err;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3225,6 +3251,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                           &journal_ioprio, NULL, 0))
                goto failed_mount;
 
+       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+               printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+                           "with data=journal disables delayed "
+                           "allocation and O_DIRECT support!\n");
+               if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and delalloc");
+                       goto failed_mount;
+               }
+               if (test_opt(sb, DIOREAD_NOLOCK)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and dioread_nolock");
+                       goto failed_mount;
+               }
+               if (test_opt(sb, DELALLOC))
+                       clear_opt(sb, DELALLOC);
+       }
+
+       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+       if (test_opt(sb, DIOREAD_NOLOCK)) {
+               if (blocksize < PAGE_SIZE) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "dioread_nolock if block size != PAGE_SIZE");
+                       goto failed_mount;
+               }
+       }
+
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
@@ -3266,8 +3319,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
                goto failed_mount;
 
-       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
            blocksize > EXT4_MAX_BLOCK_SIZE) {
                ext4_msg(sb, KERN_ERR,
@@ -3370,12 +3421,53 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                sb->s_dirt = 1;
        }
 
-       if (sbi->s_blocks_per_group > blocksize * 8) {
-               ext4_msg(sb, KERN_ERR,
-                      "#blocks per group too big: %lu",
-                      sbi->s_blocks_per_group);
-               goto failed_mount;
+       /* Handle clustersize */
+       clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+       has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                               EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+       if (has_bigalloc) {
+               if (clustersize < blocksize) {
+                       ext4_msg(sb, KERN_ERR,
+                                "cluster size (%d) smaller than "
+                                "block size (%d)", clustersize, blocksize);
+                       goto failed_mount;
+               }
+               sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+                       le32_to_cpu(es->s_log_block_size);
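+               /* s_cluster_bits is log2(clustersize / blocksize); the
+                * EXT4_B2C()/EXT4_C2B() helpers shift by it to convert
+                * between block and cluster units */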
+               sbi->s_clusters_per_group =
+                       le32_to_cpu(es->s_clusters_per_group);
+               if (sbi->s_clusters_per_group > blocksize * 8) {
+                       ext4_msg(sb, KERN_ERR,
+                                "#clusters per group too big: %lu",
+                                sbi->s_clusters_per_group);
+                       goto failed_mount;
+               }
+               if (sbi->s_blocks_per_group !=
+                   (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+                       ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+                                "clusters per group (%lu) inconsistent",
+                                sbi->s_blocks_per_group,
+                                sbi->s_clusters_per_group);
+                       goto failed_mount;
+               }
+       } else {
+               if (clustersize != blocksize) {
+                       ext4_warning(sb, "fragment/cluster size (%d) != "
+                                    "block size (%d)", clustersize,
+                                    blocksize);
+                       clustersize = blocksize;
+               }
+               if (sbi->s_blocks_per_group > blocksize * 8) {
+                       ext4_msg(sb, KERN_ERR,
+                                "#blocks per group too big: %lu",
+                                sbi->s_blocks_per_group);
+                       goto failed_mount;
+               }
+               sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+               sbi->s_cluster_bits = 0;
        }
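+       /* blocks per cluster; this is 1 unless bigalloc is enabled */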
+       sbi->s_cluster_ratio = clustersize / blocksize;
+
        if (sbi->s_inodes_per_group > blocksize * 8) {
                ext4_msg(sb, KERN_ERR,
                       "#inodes per group too big: %lu",
@@ -3484,7 +3576,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_err_report.function = print_daily_error_info;
        sbi->s_err_report.data = (unsigned long) sb;
 
-       err = percpu_counter_init(&sbi->s_freeblocks_counter,
+       err = percpu_counter_init(&sbi->s_freeclusters_counter,
                        ext4_count_free_blocks(sb));
        if (!err) {
                err = percpu_counter_init(&sbi->s_freeinodes_counter,
@@ -3495,7 +3587,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                ext4_count_dirs(sb));
        }
        if (!err) {
-               err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+               err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
        }
        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -3610,13 +3702,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         * The journal may have updated the bg summary counts, so we
         * need to update the global counters.
         */
-       percpu_counter_set(&sbi->s_freeblocks_counter,
+       percpu_counter_set(&sbi->s_freeclusters_counter,
                           ext4_count_free_blocks(sb));
        percpu_counter_set(&sbi->s_freeinodes_counter,
                           ext4_count_free_inodes(sb));
        percpu_counter_set(&sbi->s_dirs_counter,
                           ext4_count_dirs(sb));
-       percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
+       percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
 
 no_journal:
        /*
@@ -3680,25 +3772,6 @@ no_journal:
                         "available");
        }
 
-       if (test_opt(sb, DELALLOC) &&
-           (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
-               ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
-                        "requested data journaling mode");
-               clear_opt(sb, DELALLOC);
-       }
-       if (test_opt(sb, DIOREAD_NOLOCK)) {
-               if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
-                               "option - requested data journaling mode");
-                       clear_opt(sb, DIOREAD_NOLOCK);
-               }
-               if (sb->s_blocksize < PAGE_SIZE) {
-                       ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
-                               "option - block size is too small");
-                       clear_opt(sb, DIOREAD_NOLOCK);
-               }
-       }
-
        err = ext4_setup_system_zone(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3775,10 +3848,10 @@ failed_mount3:
        del_timer(&sbi->s_err_report);
        if (sbi->s_flex_groups)
                ext4_kvfree(sbi->s_flex_groups);
-       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
-       percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+       percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -4065,7 +4138,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;
 
-       if (!sbh)
+       if (!sbh || block_device_ejected(sb))
                return error;
        if (buffer_write_io_error(sbh)) {
                /*
@@ -4101,8 +4174,9 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        else
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-       ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-                                          &EXT4_SB(sb)->s_freeblocks_counter));
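+       /* free space is tracked in clusters in memory, while the on-disk
+        * superblock field is in blocks, hence the EXT4_C2B() conversion */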
+       ext4_free_blocks_count_set(es,
+                       EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+                               &EXT4_SB(sb)->s_freeclusters_counter)));
        es->s_free_inodes_count =
                cpu_to_le32(percpu_counter_sum_positive(
                                &EXT4_SB(sb)->s_freeinodes_counter));
@@ -4507,16 +4581,34 @@ restore_opts:
        return err;
 }
 
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc.  This is because multiple metadata blocks from
+ * different block groups can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time.  We will still calculate the overhead for
+ * older file systems --- and if we come across a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock.  If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ */
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
+       struct ext4_group_desc *gdp;
        u64 fsid;
        s64 bfree;
 
        if (test_opt(sb, MINIX_DF)) {
                sbi->s_overhead_last = 0;
+       } else if (es->s_overhead_clusters) {
+               sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
        } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
                ext4_group_t i, ngroups = ext4_get_groups_count(sb);
                ext4_fsblk_t overhead = 0;
@@ -4531,24 +4623,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
                 * All of the blocks before first_data_block are
                 * overhead
                 */
-               overhead = le32_to_cpu(es->s_first_data_block);
+               overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
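+               /* overhead is accumulated in cluster units from here on and
+                * converted back to blocks when f_blocks is filled in below */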
 
                /*
-                * Add the overhead attributed to the superblock and
-                * block group descriptors.  If the sparse superblocks
-                * feature is turned on, then not all groups have this.
+                * Add the overhead found in each block group
                 */
                for (i = 0; i < ngroups; i++) {
-                       overhead += ext4_bg_has_super(sb, i) +
-                               ext4_bg_num_gdb(sb, i);
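+                       /* ext4_num_overhead_clusters() counts the clusters
+                        * holding this group's metadata (superblock/GDT
+                        * backups, bitmaps, inode table), counting a shared
+                        * cluster only once */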
+                       gdp = ext4_get_group_desc(sb, i, NULL);
+                       overhead += ext4_num_overhead_clusters(sb, i, gdp);
                        cond_resched();
                }
-
-               /*
-                * Every block group has an inode bitmap, a block
-                * bitmap, and an inode table.
-                */
-               overhead += ngroups * (2 + sbi->s_itb_per_group);
                sbi->s_overhead_last = overhead;
                smp_wmb();
                sbi->s_blocks_last = ext4_blocks_count(es);
@@ -4556,11 +4640,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 
        buf->f_type = EXT4_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
-       buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-       bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
-                      percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+       buf->f_blocks = (ext4_blocks_count(es) -
+                        EXT4_C2B(sbi, sbi->s_overhead_last));
+       bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+               percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* prevent underflow in case only a little free space is available */
-       buf->f_bfree = max_t(s64, bfree, 0);
+       buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
        buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
        if (buf->f_bfree < ext4_r_blocks_count(es))
                buf->f_bavail = 0;