Merge branch 'i2c-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelvar...
[pandora-kernel.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31 #include <asm/system.h>
32
/*
 * IO completion handler for the temporary BJ_IO buffer_heads written
 * out during commit: record whether the write succeeded on the buffer
 * and release the buffer lock taken before submission.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
45
46 /*
47  * When an ext4 file is truncated, it is possible that some pages are not
48  * successfully freed, because they are attached to a committing transaction.
49  * After the transaction commits, these pages are left on the LRU, with no
50  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
51  * by the VM, but their apparent absence upsets the VM accounting, and it makes
52  * the numbers in /proc/meminfo look odd.
53  *
54  * So here, we have a buffer which has just come off the forget list.  Look to
55  * see if we can strip all buffers from the backing page.
56  *
57  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
58  * caller provided us with a ref against the buffer, and we drop that here.
59  */
60 static void release_buffer_page(struct buffer_head *bh)
61 {
62         struct page *page;
63
64         if (buffer_dirty(bh))
65                 goto nope;
66         if (atomic_read(&bh->b_count) != 1)
67                 goto nope;
68         page = bh->b_page;
69         if (!page)
70                 goto nope;
71         if (page->mapping)
72                 goto nope;
73
74         /* OK, it's a truncated page */
75         if (!trylock_page(page))
76                 goto nope;
77
78         page_cache_get(page);
79         __brelse(bh);
80         try_to_free_buffers(page);
81         unlock_page(page);
82         page_cache_release(page);
83         return;
84
85 nope:
86         __brelse(bh);
87 }
88
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * On success, *cbh is set to the commit block's buffer_head so the
 * caller can later wait on it via journal_wait_on_commit_record().
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	/* Clear the out-parameter up front so the caller never sees a
	 * stale pointer on any of the early-return paths below. */
	*cbh = NULL;

	/* Aborted journal: nothing to write, and not an error here. */
	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Fill in the on-disk commit block: magic, block type, the tid
	 * of the transaction being committed, and a commit timestamp. */
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	/* With transactional checksums enabled, record the running
	 * crc32 of the transaction's blocks in the commit header. */
	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	/* Lock, mark clean+uptodate and hook our end_io before submit;
	 * journal_end_buffer_io_sync() will unlock on completion. */
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	/* Synchronous (non-async-commit) barriers: issue the commit
	 * block with a cache flush + FUA so it reaches stable storage
	 * ordered after the transaction's other blocks. */
	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}
149
/*
 * This function along with journal_submit_commit_record
 * allows to write the commit record asynchronously.
 *
 * Returns 0 on success, or -EIO if the commit block write failed.
 * Drops the buffer and journal_head references taken when the commit
 * block was allocated.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	/* NOTE(review): dirty bit is cleared before waiting,
	 * presumably so the commit block is not written out a second
	 * time by regular writeback — confirm against submit path. */
	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
169
170 /*
171  * write the filemap data using writepage() address_space_operations.
172  * We don't do block allocation here even for delalloc. We don't
173  * use writepages() because with dealyed allocation we may be doing
174  * block allocation in writepages().
175  */
176 static int journal_submit_inode_data_buffers(struct address_space *mapping)
177 {
178         int ret;
179         struct writeback_control wbc = {
180                 .sync_mode =  WB_SYNC_ALL,
181                 .nr_to_write = mapping->nrpages * 2,
182                 .range_start = 0,
183                 .range_end = i_size_read(mapping->host),
184         };
185
186         ret = generic_writepages(mapping, &wbc);
187         return ret;
188 }
189
/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 *
 * Returns 0 on success, or the first submission error encountered
 * (later errors are dropped so the first one is reported).
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		/* Pin this inode with JI_COMMIT_RUNNING, then drop the
		 * list lock across the (blocking) writeout.  The flag
		 * keeps the inode from being released meanwhile. */
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * submit the inode data buffers. We use writepage
		 * instead of writepages. Because writepages can do
		 * block allocation  with delalloc. We need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		/* Note: set unconditionally for every inode on the
		 * list, whether or not any pages were actually dirty. */
		commit_transaction->t_flushed_data_blocks = 1;
		/* Release the inode: clear-bit, barrier, then wake any
		 * waiter sleeping on __JI_COMMIT_RUNNING. */
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
230
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 * Returns 0 on success, or the first writeback error seen; on error
 * AS_EIO is re-armed on the mapping so a later fsync() still sees it.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		/* Pin the inode, drop the lock, and wait for its pages. */
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			/* Keep only the first error for the caller. */
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		/* Unpin: clear-bit, barrier, wake bit-waiters. */
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/* Inode was also modified by the next
			 * transaction: move it onto that list. */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			/* No further transaction: detach the inode. */
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
283
284 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
285 {
286         struct page *page = bh->b_page;
287         char *addr;
288         __u32 checksum;
289
290         addr = kmap_atomic(page, KM_USER0);
291         checksum = crc32_be(crc32_sum,
292                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
293         kunmap_atomic(addr, KM_USER0);
294
295         return checksum;
296 }
297
298 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
299                                    unsigned long long block)
300 {
301         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
302         if (tag_bytes > JBD2_TAG_SIZE32)
303                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
304 }
305
306 /*
307  * jbd2_journal_commit_transaction
308  *
309  * The primary function for committing a transaction to the log.  This
310  * function is called by the journal thread to begin a complete commit.
311  */
312 void jbd2_journal_commit_transaction(journal_t *journal)
313 {
314         struct transaction_stats_s stats;
315         transaction_t *commit_transaction;
316         struct journal_head *jh, *new_jh, *descriptor;
317         struct buffer_head **wbuf = journal->j_wbuf;
318         int bufs;
319         int flags;
320         int err;
321         unsigned long long blocknr;
322         ktime_t start_time;
323         u64 commit_time;
324         char *tagp = NULL;
325         journal_header_t *header;
326         journal_block_tag_t *tag = NULL;
327         int space_left = 0;
328         int first_tag = 0;
329         int tag_flag;
330         int i, to_free = 0;
331         int tag_bytes = journal_tag_bytes(journal);
332         struct buffer_head *cbh = NULL; /* For transactional checksums */
333         __u32 crc32_sum = ~0;
334         struct blk_plug plug;
335
336         /*
337          * First job: lock down the current transaction and wait for
338          * all outstanding updates to complete.
339          */
340
341         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
342         if (journal->j_flags & JBD2_FLUSHED) {
343                 jbd_debug(3, "super block updated\n");
344                 jbd2_journal_update_superblock(journal, 1);
345         } else {
346                 jbd_debug(3, "superblock not updated\n");
347         }
348
349         J_ASSERT(journal->j_running_transaction != NULL);
350         J_ASSERT(journal->j_committing_transaction == NULL);
351
352         commit_transaction = journal->j_running_transaction;
353         J_ASSERT(commit_transaction->t_state == T_RUNNING);
354
355         trace_jbd2_start_commit(journal, commit_transaction);
356         jbd_debug(1, "JBD: starting commit of transaction %d\n",
357                         commit_transaction->t_tid);
358
359         write_lock(&journal->j_state_lock);
360         commit_transaction->t_state = T_LOCKED;
361
362         trace_jbd2_commit_locking(journal, commit_transaction);
363         stats.run.rs_wait = commit_transaction->t_max_wait;
364         stats.run.rs_locked = jiffies;
365         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
366                                               stats.run.rs_locked);
367
368         spin_lock(&commit_transaction->t_handle_lock);
369         while (atomic_read(&commit_transaction->t_updates)) {
370                 DEFINE_WAIT(wait);
371
372                 prepare_to_wait(&journal->j_wait_updates, &wait,
373                                         TASK_UNINTERRUPTIBLE);
374                 if (atomic_read(&commit_transaction->t_updates)) {
375                         spin_unlock(&commit_transaction->t_handle_lock);
376                         write_unlock(&journal->j_state_lock);
377                         schedule();
378                         write_lock(&journal->j_state_lock);
379                         spin_lock(&commit_transaction->t_handle_lock);
380                 }
381                 finish_wait(&journal->j_wait_updates, &wait);
382         }
383         spin_unlock(&commit_transaction->t_handle_lock);
384
385         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
386                         journal->j_max_transaction_buffers);
387
388         /*
389          * First thing we are allowed to do is to discard any remaining
390          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
391          * that there are no such buffers: if a large filesystem
392          * operation like a truncate needs to split itself over multiple
393          * transactions, then it may try to do a jbd2_journal_restart() while
394          * there are still BJ_Reserved buffers outstanding.  These must
395          * be released cleanly from the current transaction.
396          *
397          * In this case, the filesystem must still reserve write access
398          * again before modifying the buffer in the new transaction, but
399          * we do not require it to remember exactly which old buffers it
400          * has reserved.  This is consistent with the existing behaviour
401          * that multiple jbd2_journal_get_write_access() calls to the same
402          * buffer are perfectly permissible.
403          */
404         while (commit_transaction->t_reserved_list) {
405                 jh = commit_transaction->t_reserved_list;
406                 JBUFFER_TRACE(jh, "reserved, unused: refile");
407                 /*
408                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
409                  * leave undo-committed data.
410                  */
411                 if (jh->b_committed_data) {
412                         struct buffer_head *bh = jh2bh(jh);
413
414                         jbd_lock_bh_state(bh);
415                         jbd2_free(jh->b_committed_data, bh->b_size);
416                         jh->b_committed_data = NULL;
417                         jbd_unlock_bh_state(bh);
418                 }
419                 jbd2_journal_refile_buffer(journal, jh);
420         }
421
422         /*
423          * Now try to drop any written-back buffers from the journal's
424          * checkpoint lists.  We do this *before* commit because it potentially
425          * frees some memory
426          */
427         spin_lock(&journal->j_list_lock);
428         __jbd2_journal_clean_checkpoint_list(journal);
429         spin_unlock(&journal->j_list_lock);
430
431         jbd_debug (3, "JBD: commit phase 1\n");
432
433         /*
434          * Switch to a new revoke table.
435          */
436         jbd2_journal_switch_revoke_table(journal);
437
438         trace_jbd2_commit_flushing(journal, commit_transaction);
439         stats.run.rs_flushing = jiffies;
440         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
441                                              stats.run.rs_flushing);
442
443         commit_transaction->t_state = T_FLUSH;
444         journal->j_committing_transaction = commit_transaction;
445         journal->j_running_transaction = NULL;
446         start_time = ktime_get();
447         commit_transaction->t_log_start = journal->j_head;
448         wake_up(&journal->j_wait_transaction_locked);
449         write_unlock(&journal->j_state_lock);
450
451         jbd_debug (3, "JBD: commit phase 2\n");
452
453         /*
454          * Now start flushing things to disk, in the order they appear
455          * on the transaction lists.  Data blocks go first.
456          */
457         err = journal_submit_data_buffers(journal, commit_transaction);
458         if (err)
459                 jbd2_journal_abort(journal, err);
460
461         blk_start_plug(&plug);
462         jbd2_journal_write_revoke_records(journal, commit_transaction,
463                                           WRITE_SYNC);
464         blk_finish_plug(&plug);
465
466         jbd_debug(3, "JBD: commit phase 2\n");
467
468         /*
469          * Way to go: we have now written out all of the data for a
470          * transaction!  Now comes the tricky part: we need to write out
471          * metadata.  Loop over the transaction's entire buffer list:
472          */
473         write_lock(&journal->j_state_lock);
474         commit_transaction->t_state = T_COMMIT;
475         write_unlock(&journal->j_state_lock);
476
477         trace_jbd2_commit_logging(journal, commit_transaction);
478         stats.run.rs_logging = jiffies;
479         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
480                                                stats.run.rs_logging);
481         stats.run.rs_blocks =
482                 atomic_read(&commit_transaction->t_outstanding_credits);
483         stats.run.rs_blocks_logged = 0;
484
485         J_ASSERT(commit_transaction->t_nr_buffers <=
486                  atomic_read(&commit_transaction->t_outstanding_credits));
487
488         err = 0;
489         descriptor = NULL;
490         bufs = 0;
491         blk_start_plug(&plug);
492         while (commit_transaction->t_buffers) {
493
494                 /* Find the next buffer to be journaled... */
495
496                 jh = commit_transaction->t_buffers;
497
498                 /* If we're in abort mode, we just un-journal the buffer and
499                    release it. */
500
501                 if (is_journal_aborted(journal)) {
502                         clear_buffer_jbddirty(jh2bh(jh));
503                         JBUFFER_TRACE(jh, "journal is aborting: refile");
504                         jbd2_buffer_abort_trigger(jh,
505                                                   jh->b_frozen_data ?
506                                                   jh->b_frozen_triggers :
507                                                   jh->b_triggers);
508                         jbd2_journal_refile_buffer(journal, jh);
509                         /* If that was the last one, we need to clean up
510                          * any descriptor buffers which may have been
511                          * already allocated, even if we are now
512                          * aborting. */
513                         if (!commit_transaction->t_buffers)
514                                 goto start_journal_io;
515                         continue;
516                 }
517
518                 /* Make sure we have a descriptor block in which to
519                    record the metadata buffer. */
520
521                 if (!descriptor) {
522                         struct buffer_head *bh;
523
524                         J_ASSERT (bufs == 0);
525
526                         jbd_debug(4, "JBD: get descriptor\n");
527
528                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
529                         if (!descriptor) {
530                                 jbd2_journal_abort(journal, -EIO);
531                                 continue;
532                         }
533
534                         bh = jh2bh(descriptor);
535                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
536                                 (unsigned long long)bh->b_blocknr, bh->b_data);
537                         header = (journal_header_t *)&bh->b_data[0];
538                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
539                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
540                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
541
542                         tagp = &bh->b_data[sizeof(journal_header_t)];
543                         space_left = bh->b_size - sizeof(journal_header_t);
544                         first_tag = 1;
545                         set_buffer_jwrite(bh);
546                         set_buffer_dirty(bh);
547                         wbuf[bufs++] = bh;
548
549                         /* Record it so that we can wait for IO
550                            completion later */
551                         BUFFER_TRACE(bh, "ph3: file as descriptor");
552                         jbd2_journal_file_buffer(descriptor, commit_transaction,
553                                         BJ_LogCtl);
554                 }
555
556                 /* Where is the buffer to be written? */
557
558                 err = jbd2_journal_next_log_block(journal, &blocknr);
559                 /* If the block mapping failed, just abandon the buffer
560                    and repeat this loop: we'll fall into the
561                    refile-on-abort condition above. */
562                 if (err) {
563                         jbd2_journal_abort(journal, err);
564                         continue;
565                 }
566
567                 /*
568                  * start_this_handle() uses t_outstanding_credits to determine
569                  * the free space in the log, but this counter is changed
570                  * by jbd2_journal_next_log_block() also.
571                  */
572                 atomic_dec(&commit_transaction->t_outstanding_credits);
573
574                 /* Bump b_count to prevent truncate from stumbling over
575                    the shadowed buffer!  @@@ This can go if we ever get
576                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
577                 atomic_inc(&jh2bh(jh)->b_count);
578
579                 /* Make a temporary IO buffer with which to write it out
580                    (this will requeue both the metadata buffer and the
581                    temporary IO buffer). new_bh goes on BJ_IO*/
582
583                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
584                 /*
585                  * akpm: jbd2_journal_write_metadata_buffer() sets
586                  * new_bh->b_transaction to commit_transaction.
587                  * We need to clean this up before we release new_bh
588                  * (which is of type BJ_IO)
589                  */
590                 JBUFFER_TRACE(jh, "ph3: write metadata");
591                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
592                                                       jh, &new_jh, blocknr);
593                 if (flags < 0) {
594                         jbd2_journal_abort(journal, flags);
595                         continue;
596                 }
597                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
598                 wbuf[bufs++] = jh2bh(new_jh);
599
600                 /* Record the new block's tag in the current descriptor
601                    buffer */
602
603                 tag_flag = 0;
604                 if (flags & 1)
605                         tag_flag |= JBD2_FLAG_ESCAPE;
606                 if (!first_tag)
607                         tag_flag |= JBD2_FLAG_SAME_UUID;
608
609                 tag = (journal_block_tag_t *) tagp;
610                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
611                 tag->t_flags = cpu_to_be32(tag_flag);
612                 tagp += tag_bytes;
613                 space_left -= tag_bytes;
614
615                 if (first_tag) {
616                         memcpy (tagp, journal->j_uuid, 16);
617                         tagp += 16;
618                         space_left -= 16;
619                         first_tag = 0;
620                 }
621
622                 /* If there's no more to do, or if the descriptor is full,
623                    let the IO rip! */
624
625                 if (bufs == journal->j_wbufsize ||
626                     commit_transaction->t_buffers == NULL ||
627                     space_left < tag_bytes + 16) {
628
629                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
630
631                         /* Write an end-of-descriptor marker before
632                            submitting the IOs.  "tag" still points to
633                            the last tag we set up. */
634
635                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
636
637 start_journal_io:
638                         for (i = 0; i < bufs; i++) {
639                                 struct buffer_head *bh = wbuf[i];
640                                 /*
641                                  * Compute checksum.
642                                  */
643                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
644                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
645                                         crc32_sum =
646                                             jbd2_checksum_data(crc32_sum, bh);
647                                 }
648
649                                 lock_buffer(bh);
650                                 clear_buffer_dirty(bh);
651                                 set_buffer_uptodate(bh);
652                                 bh->b_end_io = journal_end_buffer_io_sync;
653                                 submit_bh(WRITE_SYNC, bh);
654                         }
655                         cond_resched();
656                         stats.run.rs_blocks_logged += bufs;
657
658                         /* Force a new descriptor to be generated next
659                            time round the loop. */
660                         descriptor = NULL;
661                         bufs = 0;
662                 }
663         }
664
665         err = journal_finish_inode_data_buffers(journal, commit_transaction);
666         if (err) {
667                 printk(KERN_WARNING
668                         "JBD2: Detected IO errors while flushing file data "
669                        "on %s\n", journal->j_devname);
670                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
671                         jbd2_journal_abort(journal, err);
672                 err = 0;
673         }
674
675         /* 
676          * If the journal is not located on the file system device,
677          * then we must flush the file system device before we issue
678          * the commit record
679          */
680         if (commit_transaction->t_flushed_data_blocks &&
681             (journal->j_fs_dev != journal->j_dev) &&
682             (journal->j_flags & JBD2_BARRIER))
683                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
684
685         /* Done it all: now write the commit record asynchronously. */
686         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
687                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
688                 err = journal_submit_commit_record(journal, commit_transaction,
689                                                  &cbh, crc32_sum);
690                 if (err)
691                         __jbd2_journal_abort_hard(journal);
692         }
693
694         blk_finish_plug(&plug);
695
696         /* Lo and behold: we have just managed to send a transaction to
697            the log.  Before we can commit it, wait for the IO so far to
698            complete.  Control buffers being written are on the
699            transaction's t_log_list queue, and metadata buffers are on
700            the t_iobuf_list queue.
701
	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 *
	 * Note the goto-restart pattern below: both wait_on_buffer() and
	 * cond_resched() may sleep, and the iobuf list can change while we
	 * sleep, so we always restart the scan from the list tail.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		/* Take the tail entry (reverse order, per the comment above) */
		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		/*
		 * A failed journal write is fatal for the journal; record it
		 * here and abort the journal once all waits are finished.
		 */
		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		/* As in phase 3: remember the error, abort after the loop */
		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	/*
	 * Write the commit record.  Without ASYNC_COMMIT it is submitted
	 * here, after all journal blocks are known to be on disk; with
	 * ASYNC_COMMIT it was presumably submitted earlier (outside this
	 * view) and we only wait for it, then issue a cache flush so the
	 * commit block cannot overtake the journal blocks on the platter.
	 */
	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 *
	 * The lock is dropped for each buffer while we work on it (the
	 * per-buffer work can sleep/allocate), which is why the loop
	 * re-reads t_forget each iteration and why we recheck it once
	 * more under both locks after the loop.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation been committed,  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			/* An aborted journal must not write anything back */
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	/*
	 * Retire the transaction: mark it finished and detach it from
	 * the journal.  Done under j_state_lock (taken above); per the
	 * "sleazy" comment, j_list_lock additionally guards the
	 * T_FINISHED transition against checkpointing code.
	 */
	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);

	/*
	 * If nothing is left to checkpoint the transaction can be freed
	 * right away; otherwise link it into the journal's circular
	 * doubly-linked list of checkpoint transactions (at the head).
	 * j_list_lock is still held from above.
	 */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			/* First entry: the list is a self-linked singleton */
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	/* Let the filesystem (e.g. ext4) run its per-commit callback */
	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	/* Free only after the callback/tracepoint no longer need it */
	if (to_free)
		kfree(commit_transaction);

	/* Wake anyone sleeping in jbd2_log_wait_commit() et al. */
	wake_up(&journal->j_wait_done_commit);
}