fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25 #include <linux/writeback.h>
  26 #include <linux/backing-dev.h>
  27 #include <linux/bio.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/bitops.h>
  30 #include <trace/events/jbd2.h>
  31 #include <asm/system.h>
  32
  33 /*
  34  * Default IO end handler for temporary BJ_IO buffer_heads.
  35  */
  36 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  37 {
  38         BUFFER_TRACE(bh, "");
  39         if (uptodate)
  40                 set_buffer_uptodate(bh);
  41         else
  42                 clear_buffer_uptodate(bh);
  43         unlock_buffer(bh);
  44 }
  45
  46 /*
  47  * When an ext4 file is truncated, it is possible that some pages are not
  48  * successfully freed, because they are attached to a committing transaction.
  49  * After the transaction commits, these pages are left on the LRU, with no
  50  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  51  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  52  * the numbers in /proc/meminfo look odd.
  53  *
  54  * So here, we have a buffer which has just come off the forget list.  Look to
  55  * see if we can strip all buffers from the backing page.
  56  *
  57  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  58  * caller provided us with a ref against the buffer, and we drop that here.
  59  */
  60 static void release_buffer_page(struct buffer_head *bh)
  61 {
  62         struct page *page;
  63
  64         if (buffer_dirty(bh))
  65                 goto nope;
  66         if (atomic_read(&bh->b_count) != 1)
  67                 goto nope;
  68         page = bh->b_page;
  69         if (!page)
  70                 goto nope;
  71         if (page->mapping)
  72                 goto nope;
  73
  74         /* OK, it's a truncated page */
  75         if (!trylock_page(page))
  76                 goto nope;
  77
  78         page_cache_get(page);
  79         __brelse(bh);
  80         try_to_free_buffers(page);
  81         unlock_page(page);
  82         page_cache_release(page);
  83         return;
  84
  85 nope:
  86         __brelse(bh);
  87 }
  88
  89 /*
  90  * Done it all: now submit the commit record.  We should have
  91  * cleaned up our previous buffers by now, so if we are in abort
  92  * mode we can now just skip the rest of the journal write
  93  * entirely.
  94  *
  95  * Returns 1 if the journal needs to be aborted or 0 on success
  96  */
  97 static int journal_submit_commit_record(journal_t *journal,
  98                                         transaction_t *commit_transaction,
  99                                         struct buffer_head **cbh,
 100                                         __u32 crc32_sum)
 101 {
 102         struct journal_head *descriptor;
 103         struct commit_header *tmp;
 104         struct buffer_head *bh;
 105         int ret;
 106         struct timespec now = current_kernel_time();
 107
 108         if (is_journal_aborted(journal))
 109                 return 0;
 110
 111         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 112         if (!descriptor)
 113                 return 1;
 114
 115         bh = jh2bh(descriptor);
 116
 117         tmp = (struct commit_header *)bh->b_data;
 118         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 119         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 120         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 121         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 122         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 123
 124         if (JBD2_HAS_COMPAT_FEATURE(journal,
 125                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 126                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 127                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 128                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 129         }
 130
 131         JBUFFER_TRACE(descriptor, "submit commit block");
 132         lock_buffer(bh);
 133         clear_buffer_dirty(bh);
 134         set_buffer_uptodate(bh);
 135         bh->b_end_io = journal_end_buffer_io_sync;
 136
 137         if (journal->j_flags & JBD2_BARRIER &&
 138             !JBD2_HAS_INCOMPAT_FEATURE(journal,
 139                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 140                 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
 141                 if (ret == -EOPNOTSUPP) {
 142                         printk(KERN_WARNING
 143                                "JBD2: Disabling barriers on %s, "
 144                                "not supported by device\n", journal->j_devname);
 145                         write_lock(&journal->j_state_lock);
 146                         journal->j_flags &= ~JBD2_BARRIER;
 147                         write_unlock(&journal->j_state_lock);
 148
 149                         /* And try again, without the barrier */
 150                         lock_buffer(bh);
 151                         set_buffer_uptodate(bh);
 152                         clear_buffer_dirty(bh);
 153                         ret = submit_bh(WRITE_SYNC_PLUG, bh);
 154                 }
 155         } else {
 156                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
 157         }
 158         *cbh = bh;
 159         return ret;
 160 }
 161
 162 /*
 163  * This function along with journal_submit_commit_record
 164  * allows to write the commit record asynchronously.
 165  */
 166 static int journal_wait_on_commit_record(journal_t *journal,
 167                                          struct buffer_head *bh)
 168 {
 169         int ret = 0;
 170
 171 retry:
 172         clear_buffer_dirty(bh);
 173         wait_on_buffer(bh);
 174         if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
 175                 printk(KERN_WARNING
 176                        "JBD2: %s: disabling barries on %s - not supported "
 177                        "by device\n", __func__, journal->j_devname);
 178                 write_lock(&journal->j_state_lock);
 179                 journal->j_flags &= ~JBD2_BARRIER;
 180                 write_unlock(&journal->j_state_lock);
 181
 182                 lock_buffer(bh);
 183                 clear_buffer_dirty(bh);
 184                 set_buffer_uptodate(bh);
 185                 bh->b_end_io = journal_end_buffer_io_sync;
 186
 187                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
 188                 if (ret) {
 189                         unlock_buffer(bh);
 190                         return ret;
 191                 }
 192                 goto retry;
 193         }
 194
 195         if (unlikely(!buffer_uptodate(bh)))
 196                 ret = -EIO;
 197         put_bh(bh);            /* One for getblk() */
 198         jbd2_journal_put_journal_head(bh2jh(bh));
 199
 200         return ret;
 201 }
 202
 203 /*
 204  * write the filemap data using writepage() address_space_operations.
 205  * We don't do block allocation here even for delalloc. We don't
 206  * use writepages() because with dealyed allocation we may be doing
 207  * block allocation in writepages().
 208  */
 209 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 210 {
 211         int ret;
 212         struct writeback_control wbc = {
 213                 .sync_mode =  WB_SYNC_ALL,
 214                 .nr_to_write = mapping->nrpages * 2,
 215                 .range_start = 0,
 216                 .range_end = i_size_read(mapping->host),
 217         };
 218
 219         ret = generic_writepages(mapping, &wbc);
 220         return ret;
 221 }
 222
 223 /*
 224  * Submit all the data buffers of inode associated with the transaction to
 225  * disk.
 226  *
 227  * We are in a committing transaction. Therefore no new inode can be added to
 228  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 229  * operate on from being released while we write out pages.
 230  */
 231 static int journal_submit_data_buffers(journal_t *journal,
 232                 transaction_t *commit_transaction)
 233 {
 234         struct jbd2_inode *jinode;
 235         int err, ret = 0;
 236         struct address_space *mapping;
 237
 238         spin_lock(&journal->j_list_lock);
 239         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 240                 mapping = jinode->i_vfs_inode->i_mapping;
 241                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 242                 spin_unlock(&journal->j_list_lock);
 243                 /*
 244                  * submit the inode data buffers. We use writepage
 245                  * instead of writepages. Because writepages can do
 246                  * block allocation  with delalloc. We need to write
 247                  * only allocated blocks here.
 248                  */
 249                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 250                 err = journal_submit_inode_data_buffers(mapping);
 251                 if (!ret)
 252                         ret = err;
 253                 spin_lock(&journal->j_list_lock);
 254                 J_ASSERT(jinode->i_transaction == commit_transaction);
 255                 commit_transaction->t_flushed_data_blocks = 1;
 256                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 257                 smp_mb__after_clear_bit();
 258                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 259         }
 260         spin_unlock(&journal->j_list_lock);
 261         return ret;
 262 }
 263
 264 /*
 265  * Wait for data submitted for writeout, refile inodes to proper
 266  * transaction if needed.
 267  *
 268  */
 269 static int journal_finish_inode_data_buffers(journal_t *journal,
 270                 transaction_t *commit_transaction)
 271 {
 272         struct jbd2_inode *jinode, *next_i;
 273         int err, ret = 0;
 274
 275         /* For locking, see the comment in journal_submit_data_buffers() */
 276         spin_lock(&journal->j_list_lock);
 277         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 278                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 279                 spin_unlock(&journal->j_list_lock);
 280                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 281                 if (err) {
 282                         /*
 283                          * Because AS_EIO is cleared by
 284                          * filemap_fdatawait_range(), set it again so
 285                          * that user process can get -EIO from fsync().
 286                          */
 287                         set_bit(AS_EIO,
 288                                 &jinode->i_vfs_inode->i_mapping->flags);
 289
 290                         if (!ret)
 291                                 ret = err;
 292                 }
 293                 spin_lock(&journal->j_list_lock);
 294                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 295                 smp_mb__after_clear_bit();
 296                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 297         }
 298
 299         /* Now refile inode to proper lists */
 300         list_for_each_entry_safe(jinode, next_i,
 301                                  &commit_transaction->t_inode_list, i_list) {
 302                 list_del(&jinode->i_list);
 303                 if (jinode->i_next_transaction) {
 304                         jinode->i_transaction = jinode->i_next_transaction;
 305                         jinode->i_next_transaction = NULL;
 306                         list_add(&jinode->i_list,
 307                                 &jinode->i_transaction->t_inode_list);
 308                 } else {
 309                         jinode->i_transaction = NULL;
 310                 }
 311         }
 312         spin_unlock(&journal->j_list_lock);
 313
 314         return ret;
 315 }
 316
 317 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 318 {
 319         struct page *page = bh->b_page;
 320         char *addr;
 321         __u32 checksum;
 322
 323         addr = kmap_atomic(page, KM_USER0);
 324         checksum = crc32_be(crc32_sum,
 325                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 326         kunmap_atomic(addr, KM_USER0);
 327
 328         return checksum;
 329 }
 330
 331 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 332                                    unsigned long long block)
 333 {
 334         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 335         if (tag_bytes > JBD2_TAG_SIZE32)
 336                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 337 }
 338
 339 /*
 340  * jbd2_journal_commit_transaction
 341  *
 342  * The primary function for committing a transaction to the log.  This
 343  * function is called by the journal thread to begin a complete commit.
 344  */
 345 void jbd2_journal_commit_transaction(journal_t *journal)
 346 {
 347         struct transaction_stats_s stats;
 348         transaction_t *commit_transaction;
 349         struct journal_head *jh, *new_jh, *descriptor;
 350         struct buffer_head **wbuf = journal->j_wbuf;
 351         int bufs;
 352         int flags;
 353         int err;
 354         unsigned long long blocknr;
 355         ktime_t start_time;
 356         u64 commit_time;
 357         char *tagp = NULL;
 358         journal_header_t *header;
 359         journal_block_tag_t *tag = NULL;
 360         int space_left = 0;
 361         int first_tag = 0;
 362         int tag_flag;
 363         int i, to_free = 0;
 364         int tag_bytes = journal_tag_bytes(journal);
 365         struct buffer_head *cbh = NULL; /* For transactional checksums */
 366         __u32 crc32_sum = ~0;
 367         int write_op = WRITE;
 368
 369         /*
 370          * First job: lock down the current transaction and wait for
 371          * all outstanding updates to complete.
 372          */
 373
 374 #ifdef COMMIT_STATS
 375         spin_lock(&journal->j_list_lock);
 376         summarise_journal_usage(journal);
 377         spin_unlock(&journal->j_list_lock);
 378 #endif
 379
 380         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 381         if (journal->j_flags & JBD2_FLUSHED) {
 382                 jbd_debug(3, "super block updated\n");
 383                 jbd2_journal_update_superblock(journal, 1);
 384         } else {
 385                 jbd_debug(3, "superblock not updated\n");
 386         }
 387
 388         J_ASSERT(journal->j_running_transaction != NULL);
 389         J_ASSERT(journal->j_committing_transaction == NULL);
 390
 391         commit_transaction = journal->j_running_transaction;
 392         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 393
 394         trace_jbd2_start_commit(journal, commit_transaction);
 395         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 396                         commit_transaction->t_tid);
 397
 398         write_lock(&journal->j_state_lock);
 399         commit_transaction->t_state = T_LOCKED;
 400
 401         /*
 402          * Use plugged writes here, since we want to submit several before
 403          * we unplug the device. We don't do explicit unplugging in here,
 404          * instead we rely on sync_buffer() doing the unplug for us.
 405          */
 406         if (commit_transaction->t_synchronous_commit)
 407                 write_op = WRITE_SYNC_PLUG;
 408         trace_jbd2_commit_locking(journal, commit_transaction);
 409         stats.run.rs_wait = commit_transaction->t_max_wait;
 410         stats.run.rs_locked = jiffies;
 411         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 412                                               stats.run.rs_locked);
 413
 414         spin_lock(&commit_transaction->t_handle_lock);
 415         while (atomic_read(&commit_transaction->t_updates)) {
 416                 DEFINE_WAIT(wait);
 417
 418                 prepare_to_wait(&journal->j_wait_updates, &wait,
 419                                         TASK_UNINTERRUPTIBLE);
 420                 if (atomic_read(&commit_transaction->t_updates)) {
 421                         spin_unlock(&commit_transaction->t_handle_lock);
 422                         write_unlock(&journal->j_state_lock);
 423                         schedule();
 424                         write_lock(&journal->j_state_lock);
 425                         spin_lock(&commit_transaction->t_handle_lock);
 426                 }
 427                 finish_wait(&journal->j_wait_updates, &wait);
 428         }
 429         spin_unlock(&commit_transaction->t_handle_lock);
 430
 431         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 432                         journal->j_max_transaction_buffers);
 433
 434         /*
 435          * First thing we are allowed to do is to discard any remaining
 436          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 437          * that there are no such buffers: if a large filesystem
 438          * operation like a truncate needs to split itself over multiple
 439          * transactions, then it may try to do a jbd2_journal_restart() while
 440          * there are still BJ_Reserved buffers outstanding.  These must
 441          * be released cleanly from the current transaction.
 442          *
 443          * In this case, the filesystem must still reserve write access
 444          * again before modifying the buffer in the new transaction, but
 445          * we do not require it to remember exactly which old buffers it
 446          * has reserved.  This is consistent with the existing behaviour
 447          * that multiple jbd2_journal_get_write_access() calls to the same
 448          * buffer are perfectly permissible.
 449          */
 450         while (commit_transaction->t_reserved_list) {
 451                 jh = commit_transaction->t_reserved_list;
 452                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 453                 /*
 454                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 455                  * leave undo-committed data.
 456                  */
 457                 if (jh->b_committed_data) {
 458                         struct buffer_head *bh = jh2bh(jh);
 459
 460                         jbd_lock_bh_state(bh);
 461                         jbd2_free(jh->b_committed_data, bh->b_size);
 462                         jh->b_committed_data = NULL;
 463                         jbd_unlock_bh_state(bh);
 464                 }
 465                 jbd2_journal_refile_buffer(journal, jh);
 466         }
 467
 468         /*
 469          * Now try to drop any written-back buffers from the journal's
 470          * checkpoint lists.  We do this *before* commit because it potentially
 471          * frees some memory
 472          */
 473         spin_lock(&journal->j_list_lock);
 474         __jbd2_journal_clean_checkpoint_list(journal);
 475         spin_unlock(&journal->j_list_lock);
 476
 477         jbd_debug (3, "JBD: commit phase 1\n");
 478
 479         /*
 480          * Switch to a new revoke table.
 481          */
 482         jbd2_journal_switch_revoke_table(journal);
 483
 484         trace_jbd2_commit_flushing(journal, commit_transaction);
 485         stats.run.rs_flushing = jiffies;
 486         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 487                                              stats.run.rs_flushing);
 488
 489         commit_transaction->t_state = T_FLUSH;
 490         journal->j_committing_transaction = commit_transaction;
 491         journal->j_running_transaction = NULL;
 492         start_time = ktime_get();
 493         commit_transaction->t_log_start = journal->j_head;
 494         wake_up(&journal->j_wait_transaction_locked);
 495         write_unlock(&journal->j_state_lock);
 496
 497         jbd_debug (3, "JBD: commit phase 2\n");
 498
 499         /*
 500          * Now start flushing things to disk, in the order they appear
 501          * on the transaction lists.  Data blocks go first.
 502          */
 503         err = journal_submit_data_buffers(journal, commit_transaction);
 504         if (err)
 505                 jbd2_journal_abort(journal, err);
 506
 507         jbd2_journal_write_revoke_records(journal, commit_transaction,
 508                                           write_op);
 509
 510         jbd_debug(3, "JBD: commit phase 2\n");
 511
 512         /*
 513          * Way to go: we have now written out all of the data for a
 514          * transaction!  Now comes the tricky part: we need to write out
 515          * metadata.  Loop over the transaction's entire buffer list:
 516          */
 517         write_lock(&journal->j_state_lock);
 518         commit_transaction->t_state = T_COMMIT;
 519         write_unlock(&journal->j_state_lock);
 520
 521         trace_jbd2_commit_logging(journal, commit_transaction);
 522         stats.run.rs_logging = jiffies;
 523         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 524                                                stats.run.rs_logging);
 525         stats.run.rs_blocks =
 526                 atomic_read(&commit_transaction->t_outstanding_credits);
 527         stats.run.rs_blocks_logged = 0;
 528
 529         J_ASSERT(commit_transaction->t_nr_buffers <=
 530                  atomic_read(&commit_transaction->t_outstanding_credits));
 531
 532         err = 0;
 533         descriptor = NULL;
 534         bufs = 0;
 535         while (commit_transaction->t_buffers) {
 536
 537                 /* Find the next buffer to be journaled... */
 538
 539                 jh = commit_transaction->t_buffers;
 540
 541                 /* If we're in abort mode, we just un-journal the buffer and
 542                    release it. */
 543
 544                 if (is_journal_aborted(journal)) {
 545                         clear_buffer_jbddirty(jh2bh(jh));
 546                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 547                         jbd2_buffer_abort_trigger(jh,
 548                                                   jh->b_frozen_data ?
 549                                                   jh->b_frozen_triggers :
 550                                                   jh->b_triggers);
 551                         jbd2_journal_refile_buffer(journal, jh);
 552                         /* If that was the last one, we need to clean up
 553                          * any descriptor buffers which may have been
 554                          * already allocated, even if we are now
 555                          * aborting. */
 556                         if (!commit_transaction->t_buffers)
 557                                 goto start_journal_io;
 558                         continue;
 559                 }
 560
 561                 /* Make sure we have a descriptor block in which to
 562                    record the metadata buffer. */
 563
 564                 if (!descriptor) {
 565                         struct buffer_head *bh;
 566
 567                         J_ASSERT (bufs == 0);
 568
 569                         jbd_debug(4, "JBD: get descriptor\n");
 570
 571                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 572                         if (!descriptor) {
 573                                 jbd2_journal_abort(journal, -EIO);
 574                                 continue;
 575                         }
 576
 577                         bh = jh2bh(descriptor);
 578                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 579                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 580                         header = (journal_header_t *)&bh->b_data[0];
 581                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 582                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 583                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 584
 585                         tagp = &bh->b_data[sizeof(journal_header_t)];
 586                         space_left = bh->b_size - sizeof(journal_header_t);
 587                         first_tag = 1;
 588                         set_buffer_jwrite(bh);
 589                         set_buffer_dirty(bh);
 590                         wbuf[bufs++] = bh;
 591
 592                         /* Record it so that we can wait for IO
 593                            completion later */
 594                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 595                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 596                                         BJ_LogCtl);
 597                 }
 598
 599                 /* Where is the buffer to be written? */
 600
 601                 err = jbd2_journal_next_log_block(journal, &blocknr);
 602                 /* If the block mapping failed, just abandon the buffer
 603                    and repeat this loop: we'll fall into the
 604                    refile-on-abort condition above. */
 605                 if (err) {
 606                         jbd2_journal_abort(journal, err);
 607                         continue;
 608                 }
 609
 610                 /*
 611                  * start_this_handle() uses t_outstanding_credits to determine
 612                  * the free space in the log, but this counter is changed
 613                  * by jbd2_journal_next_log_block() also.
 614                  */
 615                 atomic_dec(&commit_transaction->t_outstanding_credits);
 616
 617                 /* Bump b_count to prevent truncate from stumbling over
 618                    the shadowed buffer!  @@@ This can go if we ever get
 619                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 620                 atomic_inc(&jh2bh(jh)->b_count);
 621
 622                 /* Make a temporary IO buffer with which to write it out
 623                    (this will requeue both the metadata buffer and the
 624                    temporary IO buffer). new_bh goes on BJ_IO*/
 625
 626                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 627                 /*
 628                  * akpm: jbd2_journal_write_metadata_buffer() sets
 629                  * new_bh->b_transaction to commit_transaction.
 630                  * We need to clean this up before we release new_bh
 631                  * (which is of type BJ_IO)
 632                  */
 633                 JBUFFER_TRACE(jh, "ph3: write metadata");
 634                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 635                                                       jh, &new_jh, blocknr);
 636                 if (flags < 0) {
 637                         jbd2_journal_abort(journal, flags);
 638                         continue;
 639                 }
 640                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 641                 wbuf[bufs++] = jh2bh(new_jh);
 642
 643                 /* Record the new block's tag in the current descriptor
 644                    buffer */
 645
 646                 tag_flag = 0;
 647                 if (flags & 1)
 648                         tag_flag |= JBD2_FLAG_ESCAPE;
 649                 if (!first_tag)
 650                         tag_flag |= JBD2_FLAG_SAME_UUID;
 651
 652                 tag = (journal_block_tag_t *) tagp;
 653                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 654                 tag->t_flags = cpu_to_be32(tag_flag);
 655                 tagp += tag_bytes;
 656                 space_left -= tag_bytes;
 657
 658                 if (first_tag) {
 659                         memcpy (tagp, journal->j_uuid, 16);
 660                         tagp += 16;
 661                         space_left -= 16;
 662                         first_tag = 0;
 663                 }
 664
 665                 /* If there's no more to do, or if the descriptor is full,
 666                    let the IO rip! */
 667
 668                 if (bufs == journal->j_wbufsize ||
 669                     commit_transaction->t_buffers == NULL ||
 670                     space_left < tag_bytes + 16) {
 671
 672                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 673
 674                         /* Write an end-of-descriptor marker before
 675                            submitting the IOs.  "tag" still points to
 676                            the last tag we set up. */
 677
 678                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 679
 680 start_journal_io:
 681                         for (i = 0; i < bufs; i++) {
 682                                 struct buffer_head *bh = wbuf[i];
 683                                 /*
 684                                  * Compute checksum.
 685                                  */
 686                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 687                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 688                                         crc32_sum =
 689                                             jbd2_checksum_data(crc32_sum, bh);
 690                                 }
 691
 692                                 lock_buffer(bh);
 693                                 clear_buffer_dirty(bh);
 694                                 set_buffer_uptodate(bh);
 695                                 bh->b_end_io = journal_end_buffer_io_sync;
 696                                 submit_bh(write_op, bh);
 697                         }
 698                         cond_resched();
 699                         stats.run.rs_blocks_logged += bufs;
 700
 701                         /* Force a new descriptor to be generated next
 702                            time round the loop. */
 703                         descriptor = NULL;
 704                         bufs = 0;
 705                 }
 706         }
 707
 708         /*
 709          * If the journal is not located on the file system device,
 710          * then we must flush the file system device before we issue
 711          * the commit record
 712          */
 713         if (commit_transaction->t_flushed_data_blocks &&
 714             (journal->j_fs_dev != journal->j_dev) &&
 715             (journal->j_flags & JBD2_BARRIER))
 716                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
 717                         BLKDEV_IFL_WAIT);
 718
 719         /* Done it all: now write the commit record asynchronously. */
 720         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 721                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 722                 err = journal_submit_commit_record(journal, commit_transaction,
 723                                                  &cbh, crc32_sum);
 724                 if (err)
 725                         __jbd2_journal_abort_hard(journal);
 726                 if (journal->j_flags & JBD2_BARRIER)
 727                         blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
 728                                 BLKDEV_IFL_WAIT);
 729         }
 730
 731         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 732         if (err) {
 733                 printk(KERN_WARNING
 734                         "JBD2: Detected IO errors while flushing file data "
 735                        "on %s\n", journal->j_devname);
 736                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 737                         jbd2_journal_abort(journal, err);
 738                 err = 0;
 739         }
 740
 741         /* Lo and behold: we have just managed to send a transaction to
 742            the log.  Before we can commit it, wait for the IO so far to
 743            complete.  Control buffers being written are on the
 744            transaction's t_log_list queue, and metadata buffers are on
 745            the t_iobuf_list queue.
 746
 747            Wait for the buffers in reverse order.  That way we are
 748            less likely to be woken up until all IOs have completed, and
 749            so we incur less scheduling load.
 750         */
 751
 752         jbd_debug(3, "JBD: commit phase 3\n");
 753
 754         /*
 755          * akpm: these are BJ_IO, and j_list_lock is not needed.
 756          * See __journal_try_to_free_buffer.
 757          */
 758 wait_for_iobuf:
 759         while (commit_transaction->t_iobuf_list != NULL) {
 760                 struct buffer_head *bh;
 761
 762                 jh = commit_transaction->t_iobuf_list->b_tprev;
 763                 bh = jh2bh(jh);
 764                 if (buffer_locked(bh)) {
 765                         wait_on_buffer(bh);
 766                         goto wait_for_iobuf;
 767                 }
 768                 if (cond_resched())
 769                         goto wait_for_iobuf;
 770
 771                 if (unlikely(!buffer_uptodate(bh)))
 772                         err = -EIO;
 773
 774                 clear_buffer_jwrite(bh);
 775
 776                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 777                 jbd2_journal_unfile_buffer(journal, jh);
 778
 779                 /*
 780                  * ->t_iobuf_list should contain only dummy buffer_heads
 781                  * which were created by jbd2_journal_write_metadata_buffer().
 782                  */
 783                 BUFFER_TRACE(bh, "dumping temporary bh");
 784                 jbd2_journal_put_journal_head(jh);
 785                 __brelse(bh);
 786                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 787                 free_buffer_head(bh);
 788
 789                 /* We also have to unlock and free the corresponding
 790                    shadowed buffer */
 791                 jh = commit_transaction->t_shadow_list->b_tprev;
 792                 bh = jh2bh(jh);
 793                 clear_bit(BH_JWrite, &bh->b_state);
 794                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 795
 796                 /* The metadata is now released for reuse, but we need
 797                    to remember it against this transaction so that when
 798                    we finally commit, we can do any checkpointing
 799                    required. */
 800                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 801                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 802                 /* Wake up any transactions which were waiting for this
 803                    IO to complete */
 804                 wake_up_bit(&bh->b_state, BH_Unshadow);
 805                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 806                 __brelse(bh);
 807         }
 808
 809         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 810
 811         jbd_debug(3, "JBD: commit phase 4\n");
 812
 813         /* Here we wait for the revoke record and descriptor record buffers */
 814  wait_for_ctlbuf:
 815         while (commit_transaction->t_log_list != NULL) {
 816                 struct buffer_head *bh;
 817
 818                 jh = commit_transaction->t_log_list->b_tprev;
 819                 bh = jh2bh(jh);
 820                 if (buffer_locked(bh)) {
 821                         wait_on_buffer(bh);
 822                         goto wait_for_ctlbuf;
 823                 }
 824                 if (cond_resched())
 825                         goto wait_for_ctlbuf;
 826
 827                 if (unlikely(!buffer_uptodate(bh)))
 828                         err = -EIO;
 829
 830                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 831                 clear_buffer_jwrite(bh);
 832                 jbd2_journal_unfile_buffer(journal, jh);
 833                 jbd2_journal_put_journal_head(jh);
 834                 __brelse(bh);           /* One for getblk */
 835                 /* AKPM: bforget here */
 836         }
 837
 838         if (err)
 839                 jbd2_journal_abort(journal, err);
 840
 841         jbd_debug(3, "JBD: commit phase 5\n");
 842
 843         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 844                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 845                 err = journal_submit_commit_record(journal, commit_transaction,
 846                                                 &cbh, crc32_sum);
 847                 if (err)
 848                         __jbd2_journal_abort_hard(journal);
 849         }
 850         if (!err && !is_journal_aborted(journal))
 851                 err = journal_wait_on_commit_record(journal, cbh);
 852
 853         if (err)
 854                 jbd2_journal_abort(journal, err);
 855
 856         /* End of a transaction!  Finally, we can do checkpoint
 857            processing: any buffers committed as a result of this
 858            transaction can be removed from any checkpoint list they were on
 859            before. */
 860
 861         jbd_debug(3, "JBD: commit phase 6\n");
 862
 863         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 864         J_ASSERT(commit_transaction->t_buffers == NULL);
 865         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 866         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 867         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 868         J_ASSERT(commit_transaction->t_log_list == NULL);
 869
 870 restart_loop:
 871         /*
 872          * As there are other places (journal_unmap_buffer()) adding buffers
 873          * to this list we have to be careful and hold the j_list_lock.
 874          */
 875         spin_lock(&journal->j_list_lock);
 876         while (commit_transaction->t_forget) {
 877                 transaction_t *cp_transaction;
 878                 struct buffer_head *bh;
 879
 880                 jh = commit_transaction->t_forget;
 881                 spin_unlock(&journal->j_list_lock);
 882                 bh = jh2bh(jh);
 883                 jbd_lock_bh_state(bh);
 884                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 885
 886                 /*
 887                  * If there is undo-protected committed data against
 888                  * this buffer, then we can remove it now.  If it is a
 889                  * buffer needing such protection, the old frozen_data
 890                  * field now points to a committed version of the
 891                  * buffer, so rotate that field to the new committed
 892                  * data.
 893                  *
 894                  * Otherwise, we can just throw away the frozen data now.
 895                  *
 896                  * We also know that the frozen data has already fired
 897                  * its triggers if they exist, so we can clear that too.
 898                  */
 899                 if (jh->b_committed_data) {
 900                         jbd2_free(jh->b_committed_data, bh->b_size);
 901                         jh->b_committed_data = NULL;
 902                         if (jh->b_frozen_data) {
 903                                 jh->b_committed_data = jh->b_frozen_data;
 904                                 jh->b_frozen_data = NULL;
 905                                 jh->b_frozen_triggers = NULL;
 906                         }
 907                 } else if (jh->b_frozen_data) {
 908                         jbd2_free(jh->b_frozen_data, bh->b_size);
 909                         jh->b_frozen_data = NULL;
 910                         jh->b_frozen_triggers = NULL;
 911                 }
 912
 913                 spin_lock(&journal->j_list_lock);
 914                 cp_transaction = jh->b_cp_transaction;
 915                 if (cp_transaction) {
 916                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 917                         cp_transaction->t_chp_stats.cs_dropped++;
 918                         __jbd2_journal_remove_checkpoint(jh);
 919                 }
 920
 921                 /* Only re-checkpoint the buffer_head if it is marked
 922                  * dirty.  If the buffer was added to the BJ_Forget list
 923                  * by jbd2_journal_forget, it may no longer be dirty and
 924                  * there's no point in keeping a checkpoint record for
 925                  * it. */
 926
 927                 /* A buffer which has been freed while still being
 928                  * journaled by a previous transaction may end up still
 929                  * being dirty here, but we want to avoid writing back
 930                  * that buffer in the future after the "add to orphan"
 931                  * operation has been committed.  That's not only a performance
 932                  * gain, it also stops aliasing problems if the buffer is
 933                  * left behind for writeback and gets reallocated for another
 934                  * use in a different page. */
 935                 if (buffer_freed(bh) && !jh->b_next_transaction) {
 936                         clear_buffer_freed(bh);
 937                         clear_buffer_jbddirty(bh);
 938                 }
 939
 940                 if (buffer_jbddirty(bh)) {
 941                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 942                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 943                         if (is_journal_aborted(journal))
 944                                 clear_buffer_jbddirty(bh);
 945                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 946                         __jbd2_journal_refile_buffer(jh);
 947                         jbd_unlock_bh_state(bh);
 948                 } else {
 949                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 950                         /* The buffer on BJ_Forget list and not jbddirty means
 951                          * it has been freed by this transaction and hence it
 952                          * could not have been reallocated until this
 953                          * transaction has committed. *BUT* it could be
 954                          * reallocated once we have written all the data to
 955                          * disk and before we process the buffer on BJ_Forget
 956                          * list. */
 957                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 958                         __jbd2_journal_refile_buffer(jh);
 959                         if (!jh->b_transaction) {
 960                                 jbd_unlock_bh_state(bh);
 961                                  /* needs a brelse */
 962                                 jbd2_journal_remove_journal_head(bh);
 963                                 release_buffer_page(bh);
 964                         } else
 965                                 jbd_unlock_bh_state(bh);
 966                 }
 967                 cond_resched_lock(&journal->j_list_lock);
 968         }
 969         spin_unlock(&journal->j_list_lock);
 970         /*
 971          * This is a bit sleazy.  We use j_list_lock to protect transition
 972          * of a transaction into T_FINISHED state and calling
 973          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 974          * other checkpointing code processing the transaction...
 975          */
 976         write_lock(&journal->j_state_lock);
 977         spin_lock(&journal->j_list_lock);
 978         /*
 979          * Now recheck whether any buffers got attached to the transaction's
 980          * forget list while the lock was dropped...
 981          */
 982         if (commit_transaction->t_forget) {
 983                 spin_unlock(&journal->j_list_lock);
 984                 write_unlock(&journal->j_state_lock);
 985                 goto restart_loop;
 986         }
 987
 988         /* Done with this transaction! */
 989
 990         jbd_debug(3, "JBD: commit phase 7\n");
 991
 992         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 993
 994         commit_transaction->t_start = jiffies;
 995         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
 996                                               commit_transaction->t_start);
 997
 998         /*
 999          * File the transaction statistics
1000          */
1001         stats.ts_tid = commit_transaction->t_tid;
1002         stats.run.rs_handle_count =
1003                 atomic_read(&commit_transaction->t_handle_count);
1004         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1005                              commit_transaction->t_tid, &stats.run);
1006
1007         /*
1008          * Calculate overall stats
1009          */
1010         spin_lock(&journal->j_history_lock);
1011         journal->j_stats.ts_tid++;
1012         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1013         journal->j_stats.run.rs_running += stats.run.rs_running;
1014         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1015         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1016         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1017         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1018         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1019         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1020         spin_unlock(&journal->j_history_lock);
1021
1022         commit_transaction->t_state = T_FINISHED;
1023         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1024         journal->j_commit_sequence = commit_transaction->t_tid;
1025         journal->j_committing_transaction = NULL;
1026         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1027
1028         /*
1029          * weight the running average higher than the latest commit time so
1030          * we don't react too strongly to vast changes in the commit time
1031          */
1032         if (likely(journal->j_average_commit_time))
1033                 journal->j_average_commit_time = (commit_time +
1034                                 journal->j_average_commit_time*3) / 4;
1035         else
1036                 journal->j_average_commit_time = commit_time;
1037         write_unlock(&journal->j_state_lock);
1038
1039         if (commit_transaction->t_checkpoint_list == NULL &&
1040             commit_transaction->t_checkpoint_io_list == NULL) {
1041                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1042                 to_free = 1;
1043         } else {
1044                 if (journal->j_checkpoint_transactions == NULL) {
1045                         journal->j_checkpoint_transactions = commit_transaction;
1046                         commit_transaction->t_cpnext = commit_transaction;
1047                         commit_transaction->t_cpprev = commit_transaction;
1048                 } else {
1049                         commit_transaction->t_cpnext =
1050                                 journal->j_checkpoint_transactions;
1051                         commit_transaction->t_cpprev =
1052                                 commit_transaction->t_cpnext->t_cpprev;
1053                         commit_transaction->t_cpnext->t_cpprev =
1054                                 commit_transaction;
1055                         commit_transaction->t_cpprev->t_cpnext =
1056                                 commit_transaction;
1057                 }
1058         }
1059         spin_unlock(&journal->j_list_lock);
1060
1061         if (journal->j_commit_callback)
1062                 journal->j_commit_callback(journal, commit_transaction);
1063
1064         trace_jbd2_end_commit(journal, commit_transaction);
1065         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1066                   journal->j_commit_sequence, journal->j_tail_sequence);
1067         if (to_free)
1068                 kfree(commit_transaction);
1069
1070         wake_up(&journal->j_wait_done_commit);
1071 }