fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25 #include <linux/writeback.h>
  26 #include <linux/backing-dev.h>
  27 #include <linux/bio.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/bitops.h>
  30 #include <trace/events/jbd2.h>
  31 #include <asm/system.h>
  32
  33 /*
  34  * Default IO end handler for temporary BJ_IO buffer_heads.
  35  */
  36 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  37 {
  38         BUFFER_TRACE(bh, "");
  39         if (uptodate)
  40                 set_buffer_uptodate(bh);
  41         else
  42                 clear_buffer_uptodate(bh);
  43         unlock_buffer(bh);
  44 }
  45
  46 /*
  47  * When an ext4 file is truncated, it is possible that some pages are not
  48  * successfully freed, because they are attached to a committing transaction.
  49  * After the transaction commits, these pages are left on the LRU, with no
  50  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  51  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  52  * the numbers in /proc/meminfo look odd.
  53  *
  54  * So here, we have a buffer which has just come off the forget list.  Look to
  55  * see if we can strip all buffers from the backing page.
  56  *
  57  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  58  * caller provided us with a ref against the buffer, and we drop that here.
  59  */
  60 static void release_buffer_page(struct buffer_head *bh)
  61 {
  62         struct page *page;
  63
  64         if (buffer_dirty(bh))
  65                 goto nope;
  66         if (atomic_read(&bh->b_count) != 1)
  67                 goto nope;
  68         page = bh->b_page;
  69         if (!page)
  70                 goto nope;
  71         if (page->mapping)
  72                 goto nope;
  73
  74         /* OK, it's a truncated page */
  75         if (!trylock_page(page))
  76                 goto nope;
  77
  78         page_cache_get(page);
  79         __brelse(bh);
  80         try_to_free_buffers(page);
  81         unlock_page(page);
  82         page_cache_release(page);
  83         return;
  84
  85 nope:
  86         __brelse(bh);
  87 }
  88
  89 /*
  90  * Done it all: now submit the commit record.  We should have
  91  * cleaned up our previous buffers by now, so if we are in abort
  92  * mode we can now just skip the rest of the journal write
  93  * entirely.
  94  *
  95  * Returns 1 if the journal needs to be aborted or 0 on success
  96  */
  97 static int journal_submit_commit_record(journal_t *journal,
  98                                         transaction_t *commit_transaction,
  99                                         struct buffer_head **cbh,
 100                                         __u32 crc32_sum)
 101 {
 102         struct journal_head *descriptor;
 103         struct commit_header *tmp;
 104         struct buffer_head *bh;
 105         int ret;
 106         struct timespec now = current_kernel_time();
 107
 108         if (is_journal_aborted(journal))
 109                 return 0;
 110
 111         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 112         if (!descriptor)
 113                 return 1;
 114
 115         bh = jh2bh(descriptor);
 116
 117         tmp = (struct commit_header *)bh->b_data;
 118         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 119         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 120         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 121         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 122         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 123
 124         if (JBD2_HAS_COMPAT_FEATURE(journal,
 125                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 126                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 127                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 128                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 129         }
 130
 131         JBUFFER_TRACE(descriptor, "submit commit block");
 132         lock_buffer(bh);
 133         clear_buffer_dirty(bh);
 134         set_buffer_uptodate(bh);
 135         bh->b_end_io = journal_end_buffer_io_sync;
 136
 137         if (journal->j_flags & JBD2_BARRIER &&
 138             !JBD2_HAS_INCOMPAT_FEATURE(journal,
 139                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 140                 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 141         else
 142                 ret = submit_bh(WRITE_SYNC, bh);
 143
 144         *cbh = bh;
 145         return ret;
 146 }
 147
 148 /*
 149  * This function along with journal_submit_commit_record
 150  * allows to write the commit record asynchronously.
 151  */
 152 static int journal_wait_on_commit_record(journal_t *journal,
 153                                          struct buffer_head *bh)
 154 {
 155         int ret = 0;
 156
 157         clear_buffer_dirty(bh);
 158         wait_on_buffer(bh);
 159
 160         if (unlikely(!buffer_uptodate(bh)))
 161                 ret = -EIO;
 162         put_bh(bh);            /* One for getblk() */
 163         jbd2_journal_put_journal_head(bh2jh(bh));
 164
 165         return ret;
 166 }
 167
 168 /*
 169  * write the filemap data using writepage() address_space_operations.
 170  * We don't do block allocation here even for delalloc. We don't
 171  * use writepages() because with dealyed allocation we may be doing
 172  * block allocation in writepages().
 173  */
 174 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 175 {
 176         int ret;
 177         struct writeback_control wbc = {
 178                 .sync_mode =  WB_SYNC_ALL,
 179                 .nr_to_write = mapping->nrpages * 2,
 180                 .range_start = 0,
 181                 .range_end = i_size_read(mapping->host),
 182         };
 183
 184         ret = generic_writepages(mapping, &wbc);
 185         return ret;
 186 }
 187
 188 /*
 189  * Submit all the data buffers of inode associated with the transaction to
 190  * disk.
 191  *
 192  * We are in a committing transaction. Therefore no new inode can be added to
 193  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 194  * operate on from being released while we write out pages.
 195  */
 196 static int journal_submit_data_buffers(journal_t *journal,
 197                 transaction_t *commit_transaction)
 198 {
 199         struct jbd2_inode *jinode;
 200         int err, ret = 0;
 201         struct address_space *mapping;
 202
 203         spin_lock(&journal->j_list_lock);
 204         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 205                 mapping = jinode->i_vfs_inode->i_mapping;
 206                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 207                 spin_unlock(&journal->j_list_lock);
 208                 /*
 209                  * submit the inode data buffers. We use writepage
 210                  * instead of writepages. Because writepages can do
 211                  * block allocation  with delalloc. We need to write
 212                  * only allocated blocks here.
 213                  */
 214                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 215                 err = journal_submit_inode_data_buffers(mapping);
 216                 if (!ret)
 217                         ret = err;
 218                 spin_lock(&journal->j_list_lock);
 219                 J_ASSERT(jinode->i_transaction == commit_transaction);
 220                 commit_transaction->t_flushed_data_blocks = 1;
 221                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 222                 smp_mb__after_clear_bit();
 223                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 224         }
 225         spin_unlock(&journal->j_list_lock);
 226         return ret;
 227 }
 228
 229 /*
 230  * Wait for data submitted for writeout, refile inodes to proper
 231  * transaction if needed.
 232  *
 233  */
 234 static int journal_finish_inode_data_buffers(journal_t *journal,
 235                 transaction_t *commit_transaction)
 236 {
 237         struct jbd2_inode *jinode, *next_i;
 238         int err, ret = 0;
 239
 240         /* For locking, see the comment in journal_submit_data_buffers() */
 241         spin_lock(&journal->j_list_lock);
 242         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 243                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 244                 spin_unlock(&journal->j_list_lock);
 245                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 246                 if (err) {
 247                         /*
 248                          * Because AS_EIO is cleared by
 249                          * filemap_fdatawait_range(), set it again so
 250                          * that user process can get -EIO from fsync().
 251                          */
 252                         set_bit(AS_EIO,
 253                                 &jinode->i_vfs_inode->i_mapping->flags);
 254
 255                         if (!ret)
 256                                 ret = err;
 257                 }
 258                 spin_lock(&journal->j_list_lock);
 259                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 260                 smp_mb__after_clear_bit();
 261                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 262         }
 263
 264         /* Now refile inode to proper lists */
 265         list_for_each_entry_safe(jinode, next_i,
 266                                  &commit_transaction->t_inode_list, i_list) {
 267                 list_del(&jinode->i_list);
 268                 if (jinode->i_next_transaction) {
 269                         jinode->i_transaction = jinode->i_next_transaction;
 270                         jinode->i_next_transaction = NULL;
 271                         list_add(&jinode->i_list,
 272                                 &jinode->i_transaction->t_inode_list);
 273                 } else {
 274                         jinode->i_transaction = NULL;
 275                 }
 276         }
 277         spin_unlock(&journal->j_list_lock);
 278
 279         return ret;
 280 }
 281
 282 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 283 {
 284         struct page *page = bh->b_page;
 285         char *addr;
 286         __u32 checksum;
 287
 288         addr = kmap_atomic(page, KM_USER0);
 289         checksum = crc32_be(crc32_sum,
 290                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 291         kunmap_atomic(addr, KM_USER0);
 292
 293         return checksum;
 294 }
 295
 296 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 297                                    unsigned long long block)
 298 {
 299         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 300         if (tag_bytes > JBD2_TAG_SIZE32)
 301                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 302 }
 303
 304 /*
 305  * jbd2_journal_commit_transaction
 306  *
 307  * The primary function for committing a transaction to the log.  This
 308  * function is called by the journal thread to begin a complete commit.
 309  */
 310 void jbd2_journal_commit_transaction(journal_t *journal)
 311 {
 312         struct transaction_stats_s stats;
 313         transaction_t *commit_transaction;
 314         struct journal_head *jh, *new_jh, *descriptor;
 315         struct buffer_head **wbuf = journal->j_wbuf;
 316         int bufs;
 317         int flags;
 318         int err;
 319         unsigned long long blocknr;
 320         ktime_t start_time;
 321         u64 commit_time;
 322         char *tagp = NULL;
 323         journal_header_t *header;
 324         journal_block_tag_t *tag = NULL;
 325         int space_left = 0;
 326         int first_tag = 0;
 327         int tag_flag;
 328         int i, to_free = 0;
 329         int tag_bytes = journal_tag_bytes(journal);
 330         struct buffer_head *cbh = NULL; /* For transactional checksums */
 331         __u32 crc32_sum = ~0;
 332         struct blk_plug plug;
 333
 334         /*
 335          * First job: lock down the current transaction and wait for
 336          * all outstanding updates to complete.
 337          */
 338
 339 #ifdef COMMIT_STATS
 340         spin_lock(&journal->j_list_lock);
 341         summarise_journal_usage(journal);
 342         spin_unlock(&journal->j_list_lock);
 343 #endif
 344
 345         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 346         if (journal->j_flags & JBD2_FLUSHED) {
 347                 jbd_debug(3, "super block updated\n");
 348                 jbd2_journal_update_superblock(journal, 1);
 349         } else {
 350                 jbd_debug(3, "superblock not updated\n");
 351         }
 352
 353         J_ASSERT(journal->j_running_transaction != NULL);
 354         J_ASSERT(journal->j_committing_transaction == NULL);
 355
 356         commit_transaction = journal->j_running_transaction;
 357         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 358
 359         trace_jbd2_start_commit(journal, commit_transaction);
 360         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 361                         commit_transaction->t_tid);
 362
 363         write_lock(&journal->j_state_lock);
 364         commit_transaction->t_state = T_LOCKED;
 365
 366         trace_jbd2_commit_locking(journal, commit_transaction);
 367         stats.run.rs_wait = commit_transaction->t_max_wait;
 368         stats.run.rs_locked = jiffies;
 369         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 370                                               stats.run.rs_locked);
 371
 372         spin_lock(&commit_transaction->t_handle_lock);
 373         while (atomic_read(&commit_transaction->t_updates)) {
 374                 DEFINE_WAIT(wait);
 375
 376                 prepare_to_wait(&journal->j_wait_updates, &wait,
 377                                         TASK_UNINTERRUPTIBLE);
 378                 if (atomic_read(&commit_transaction->t_updates)) {
 379                         spin_unlock(&commit_transaction->t_handle_lock);
 380                         write_unlock(&journal->j_state_lock);
 381                         schedule();
 382                         write_lock(&journal->j_state_lock);
 383                         spin_lock(&commit_transaction->t_handle_lock);
 384                 }
 385                 finish_wait(&journal->j_wait_updates, &wait);
 386         }
 387         spin_unlock(&commit_transaction->t_handle_lock);
 388
 389         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 390                         journal->j_max_transaction_buffers);
 391
 392         /*
 393          * First thing we are allowed to do is to discard any remaining
 394          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 395          * that there are no such buffers: if a large filesystem
 396          * operation like a truncate needs to split itself over multiple
 397          * transactions, then it may try to do a jbd2_journal_restart() while
 398          * there are still BJ_Reserved buffers outstanding.  These must
 399          * be released cleanly from the current transaction.
 400          *
 401          * In this case, the filesystem must still reserve write access
 402          * again before modifying the buffer in the new transaction, but
 403          * we do not require it to remember exactly which old buffers it
 404          * has reserved.  This is consistent with the existing behaviour
 405          * that multiple jbd2_journal_get_write_access() calls to the same
 406          * buffer are perfectly permissible.
 407          */
 408         while (commit_transaction->t_reserved_list) {
 409                 jh = commit_transaction->t_reserved_list;
 410                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 411                 /*
 412                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 413                  * leave undo-committed data.
 414                  */
 415                 if (jh->b_committed_data) {
 416                         struct buffer_head *bh = jh2bh(jh);
 417
 418                         jbd_lock_bh_state(bh);
 419                         jbd2_free(jh->b_committed_data, bh->b_size);
 420                         jh->b_committed_data = NULL;
 421                         jbd_unlock_bh_state(bh);
 422                 }
 423                 jbd2_journal_refile_buffer(journal, jh);
 424         }
 425
 426         /*
 427          * Now try to drop any written-back buffers from the journal's
 428          * checkpoint lists.  We do this *before* commit because it potentially
 429          * frees some memory
 430          */
 431         spin_lock(&journal->j_list_lock);
 432         __jbd2_journal_clean_checkpoint_list(journal);
 433         spin_unlock(&journal->j_list_lock);
 434
 435         jbd_debug (3, "JBD: commit phase 1\n");
 436
 437         /*
 438          * Switch to a new revoke table.
 439          */
 440         jbd2_journal_switch_revoke_table(journal);
 441
 442         trace_jbd2_commit_flushing(journal, commit_transaction);
 443         stats.run.rs_flushing = jiffies;
 444         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 445                                              stats.run.rs_flushing);
 446
 447         commit_transaction->t_state = T_FLUSH;
 448         journal->j_committing_transaction = commit_transaction;
 449         journal->j_running_transaction = NULL;
 450         start_time = ktime_get();
 451         commit_transaction->t_log_start = journal->j_head;
 452         wake_up(&journal->j_wait_transaction_locked);
 453         write_unlock(&journal->j_state_lock);
 454
 455         jbd_debug (3, "JBD: commit phase 2\n");
 456
 457         /*
 458          * Now start flushing things to disk, in the order they appear
 459          * on the transaction lists.  Data blocks go first.
 460          */
 461         err = journal_submit_data_buffers(journal, commit_transaction);
 462         if (err)
 463                 jbd2_journal_abort(journal, err);
 464
 465         blk_start_plug(&plug);
 466         jbd2_journal_write_revoke_records(journal, commit_transaction,
 467                                           WRITE_SYNC);
 468         blk_finish_plug(&plug);
 469
 470         jbd_debug(3, "JBD: commit phase 2\n");
 471
 472         /*
 473          * Way to go: we have now written out all of the data for a
 474          * transaction!  Now comes the tricky part: we need to write out
 475          * metadata.  Loop over the transaction's entire buffer list:
 476          */
 477         write_lock(&journal->j_state_lock);
 478         commit_transaction->t_state = T_COMMIT;
 479         write_unlock(&journal->j_state_lock);
 480
 481         trace_jbd2_commit_logging(journal, commit_transaction);
 482         stats.run.rs_logging = jiffies;
 483         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 484                                                stats.run.rs_logging);
 485         stats.run.rs_blocks =
 486                 atomic_read(&commit_transaction->t_outstanding_credits);
 487         stats.run.rs_blocks_logged = 0;
 488
 489         J_ASSERT(commit_transaction->t_nr_buffers <=
 490                  atomic_read(&commit_transaction->t_outstanding_credits));
 491
 492         err = 0;
 493         descriptor = NULL;
 494         bufs = 0;
 495         blk_start_plug(&plug);
 496         while (commit_transaction->t_buffers) {
 497
 498                 /* Find the next buffer to be journaled... */
 499
 500                 jh = commit_transaction->t_buffers;
 501
 502                 /* If we're in abort mode, we just un-journal the buffer and
 503                    release it. */
 504
 505                 if (is_journal_aborted(journal)) {
 506                         clear_buffer_jbddirty(jh2bh(jh));
 507                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 508                         jbd2_buffer_abort_trigger(jh,
 509                                                   jh->b_frozen_data ?
 510                                                   jh->b_frozen_triggers :
 511                                                   jh->b_triggers);
 512                         jbd2_journal_refile_buffer(journal, jh);
 513                         /* If that was the last one, we need to clean up
 514                          * any descriptor buffers which may have been
 515                          * already allocated, even if we are now
 516                          * aborting. */
 517                         if (!commit_transaction->t_buffers)
 518                                 goto start_journal_io;
 519                         continue;
 520                 }
 521
 522                 /* Make sure we have a descriptor block in which to
 523                    record the metadata buffer. */
 524
 525                 if (!descriptor) {
 526                         struct buffer_head *bh;
 527
 528                         J_ASSERT (bufs == 0);
 529
 530                         jbd_debug(4, "JBD: get descriptor\n");
 531
 532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 533                         if (!descriptor) {
 534                                 jbd2_journal_abort(journal, -EIO);
 535                                 continue;
 536                         }
 537
 538                         bh = jh2bh(descriptor);
 539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 541                         header = (journal_header_t *)&bh->b_data[0];
 542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 545
 546                         tagp = &bh->b_data[sizeof(journal_header_t)];
 547                         space_left = bh->b_size - sizeof(journal_header_t);
 548                         first_tag = 1;
 549                         set_buffer_jwrite(bh);
 550                         set_buffer_dirty(bh);
 551                         wbuf[bufs++] = bh;
 552
 553                         /* Record it so that we can wait for IO
 554                            completion later */
 555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 557                                         BJ_LogCtl);
 558                 }
 559
 560                 /* Where is the buffer to be written? */
 561
 562                 err = jbd2_journal_next_log_block(journal, &blocknr);
 563                 /* If the block mapping failed, just abandon the buffer
 564                    and repeat this loop: we'll fall into the
 565                    refile-on-abort condition above. */
 566                 if (err) {
 567                         jbd2_journal_abort(journal, err);
 568                         continue;
 569                 }
 570
 571                 /*
 572                  * start_this_handle() uses t_outstanding_credits to determine
 573                  * the free space in the log, but this counter is changed
 574                  * by jbd2_journal_next_log_block() also.
 575                  */
 576                 atomic_dec(&commit_transaction->t_outstanding_credits);
 577
 578                 /* Bump b_count to prevent truncate from stumbling over
 579                    the shadowed buffer!  @@@ This can go if we ever get
 580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 581                 atomic_inc(&jh2bh(jh)->b_count);
 582
 583                 /* Make a temporary IO buffer with which to write it out
 584                    (this will requeue both the metadata buffer and the
 585                    temporary IO buffer). new_bh goes on BJ_IO*/
 586
 587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 588                 /*
 589                  * akpm: jbd2_journal_write_metadata_buffer() sets
 590                  * new_bh->b_transaction to commit_transaction.
 591                  * We need to clean this up before we release new_bh
 592                  * (which is of type BJ_IO)
 593                  */
 594                 JBUFFER_TRACE(jh, "ph3: write metadata");
 595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 596                                                       jh, &new_jh, blocknr);
 597                 if (flags < 0) {
 598                         jbd2_journal_abort(journal, flags);
 599                         continue;
 600                 }
 601                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 602                 wbuf[bufs++] = jh2bh(new_jh);
 603
 604                 /* Record the new block's tag in the current descriptor
 605                    buffer */
 606
 607                 tag_flag = 0;
 608                 if (flags & 1)
 609                         tag_flag |= JBD2_FLAG_ESCAPE;
 610                 if (!first_tag)
 611                         tag_flag |= JBD2_FLAG_SAME_UUID;
 612
 613                 tag = (journal_block_tag_t *) tagp;
 614                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 615                 tag->t_flags = cpu_to_be32(tag_flag);
 616                 tagp += tag_bytes;
 617                 space_left -= tag_bytes;
 618
 619                 if (first_tag) {
 620                         memcpy (tagp, journal->j_uuid, 16);
 621                         tagp += 16;
 622                         space_left -= 16;
 623                         first_tag = 0;
 624                 }
 625
 626                 /* If there's no more to do, or if the descriptor is full,
 627                    let the IO rip! */
 628
 629                 if (bufs == journal->j_wbufsize ||
 630                     commit_transaction->t_buffers == NULL ||
 631                     space_left < tag_bytes + 16) {
 632
 633                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 634
 635                         /* Write an end-of-descriptor marker before
 636                            submitting the IOs.  "tag" still points to
 637                            the last tag we set up. */
 638
 639                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 640
 641 start_journal_io:
 642                         for (i = 0; i < bufs; i++) {
 643                                 struct buffer_head *bh = wbuf[i];
 644                                 /*
 645                                  * Compute checksum.
 646                                  */
 647                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 648                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 649                                         crc32_sum =
 650                                             jbd2_checksum_data(crc32_sum, bh);
 651                                 }
 652
 653                                 lock_buffer(bh);
 654                                 clear_buffer_dirty(bh);
 655                                 set_buffer_uptodate(bh);
 656                                 bh->b_end_io = journal_end_buffer_io_sync;
 657                                 submit_bh(WRITE_SYNC, bh);
 658                         }
 659                         cond_resched();
 660                         stats.run.rs_blocks_logged += bufs;
 661
 662                         /* Force a new descriptor to be generated next
 663                            time round the loop. */
 664                         descriptor = NULL;
 665                         bufs = 0;
 666                 }
 667         }
 668
 669         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 670         if (err) {
 671                 printk(KERN_WARNING
 672                         "JBD2: Detected IO errors while flushing file data "
 673                        "on %s\n", journal->j_devname);
 674                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 675                         jbd2_journal_abort(journal, err);
 676                 err = 0;
 677         }
 678
 679         /*
 680          * If the journal is not located on the file system device,
 681          * then we must flush the file system device before we issue
 682          * the commit record
 683          */
 684         if (commit_transaction->t_flushed_data_blocks &&
 685             (journal->j_fs_dev != journal->j_dev) &&
 686             (journal->j_flags & JBD2_BARRIER))
 687                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 688
 689         /* Done it all: now write the commit record asynchronously. */
 690         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 691                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 692                 err = journal_submit_commit_record(journal, commit_transaction,
 693                                                  &cbh, crc32_sum);
 694                 if (err)
 695                         __jbd2_journal_abort_hard(journal);
 696         }
 697
 698         blk_finish_plug(&plug);
 699
 700         /* Lo and behold: we have just managed to send a transaction to
 701            the log.  Before we can commit it, wait for the IO so far to
 702            complete.  Control buffers being written are on the
 703            transaction's t_log_list queue, and metadata buffers are on
 704            the t_iobuf_list queue.
 705
 706            Wait for the buffers in reverse order.  That way we are
 707            less likely to be woken up until all IOs have completed, and
 708            so we incur less scheduling load.
 709         */
 710
 711         jbd_debug(3, "JBD: commit phase 3\n");
 712
 713         /*
 714          * akpm: these are BJ_IO, and j_list_lock is not needed.
 715          * See __journal_try_to_free_buffer.
 716          */
 717 wait_for_iobuf:
 718         while (commit_transaction->t_iobuf_list != NULL) {
 719                 struct buffer_head *bh;
 720
 721                 jh = commit_transaction->t_iobuf_list->b_tprev;
 722                 bh = jh2bh(jh);
 723                 if (buffer_locked(bh)) {
 724                         wait_on_buffer(bh);
 725                         goto wait_for_iobuf;
 726                 }
 727                 if (cond_resched())
 728                         goto wait_for_iobuf;
 729
 730                 if (unlikely(!buffer_uptodate(bh)))
 731                         err = -EIO;
 732
 733                 clear_buffer_jwrite(bh);
 734
 735                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 736                 jbd2_journal_unfile_buffer(journal, jh);
 737
 738                 /*
 739                  * ->t_iobuf_list should contain only dummy buffer_heads
 740                  * which were created by jbd2_journal_write_metadata_buffer().
 741                  */
 742                 BUFFER_TRACE(bh, "dumping temporary bh");
 743                 jbd2_journal_put_journal_head(jh);
 744                 __brelse(bh);
 745                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 746                 free_buffer_head(bh);
 747
 748                 /* We also have to unlock and free the corresponding
 749                    shadowed buffer */
 750                 jh = commit_transaction->t_shadow_list->b_tprev;
 751                 bh = jh2bh(jh);
 752                 clear_bit(BH_JWrite, &bh->b_state);
 753                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 754
 755                 /* The metadata is now released for reuse, but we need
 756                    to remember it against this transaction so that when
 757                    we finally commit, we can do any checkpointing
 758                    required. */
 759                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 760                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 761                 /* Wake up any transactions which were waiting for this
 762                    IO to complete */
 763                 wake_up_bit(&bh->b_state, BH_Unshadow);
 764                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 765                 __brelse(bh);
 766         }
 767
 768         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 769
 770         jbd_debug(3, "JBD: commit phase 4\n");
 771
 772         /* Here we wait for the revoke record and descriptor record buffers */
 773  wait_for_ctlbuf:
 774         while (commit_transaction->t_log_list != NULL) {
 775                 struct buffer_head *bh;
 776
 777                 jh = commit_transaction->t_log_list->b_tprev;
 778                 bh = jh2bh(jh);
 779                 if (buffer_locked(bh)) {
 780                         wait_on_buffer(bh);
 781                         goto wait_for_ctlbuf;
 782                 }
 783                 if (cond_resched())
 784                         goto wait_for_ctlbuf;
 785
 786                 if (unlikely(!buffer_uptodate(bh)))
 787                         err = -EIO;
 788
 789                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 790                 clear_buffer_jwrite(bh);
 791                 jbd2_journal_unfile_buffer(journal, jh);
 792                 jbd2_journal_put_journal_head(jh);
 793                 __brelse(bh);           /* One for getblk */
 794                 /* AKPM: bforget here */
 795         }
 796
 797         if (err)
 798                 jbd2_journal_abort(journal, err);
 799
 800         jbd_debug(3, "JBD: commit phase 5\n");
 801
 802         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 803                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 804                 err = journal_submit_commit_record(journal, commit_transaction,
 805                                                 &cbh, crc32_sum);
 806                 if (err)
 807                         __jbd2_journal_abort_hard(journal);
 808         }
 809         if (!err && !is_journal_aborted(journal))
 810                 err = journal_wait_on_commit_record(journal, cbh);
 811         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 812                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 813             journal->j_flags & JBD2_BARRIER) {
 814                 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 815         }
 816
 817         if (err)
 818                 jbd2_journal_abort(journal, err);
 819
 820         /* End of a transaction!  Finally, we can do checkpoint
 821            processing: any buffers committed as a result of this
 822            transaction can be removed from any checkpoint list they were on
 823            before. */
 824
 825         jbd_debug(3, "JBD: commit phase 6\n");
 826
 827         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 828         J_ASSERT(commit_transaction->t_buffers == NULL);
 829         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 830         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 831         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 832         J_ASSERT(commit_transaction->t_log_list == NULL);
 833
 834 restart_loop:
 835         /*
 836          * As there are other places (journal_unmap_buffer()) adding buffers
 837          * to this list we have to be careful and hold the j_list_lock.
 838          */
 839         spin_lock(&journal->j_list_lock);
 840         while (commit_transaction->t_forget) {
 841                 transaction_t *cp_transaction;
 842                 struct buffer_head *bh;
 843
 844                 jh = commit_transaction->t_forget;
 845                 spin_unlock(&journal->j_list_lock);
 846                 bh = jh2bh(jh);
 847                 jbd_lock_bh_state(bh);
 848                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 849
 850                 /*
 851                  * If there is undo-protected committed data against
 852                  * this buffer, then we can remove it now.  If it is a
 853                  * buffer needing such protection, the old frozen_data
 854                  * field now points to a committed version of the
 855                  * buffer, so rotate that field to the new committed
 856                  * data.
 857                  *
 858                  * Otherwise, we can just throw away the frozen data now.
 859                  *
 860                  * We also know that the frozen data has already fired
 861                  * its triggers if they exist, so we can clear that too.
 862                  */
 863                 if (jh->b_committed_data) {
 864                         jbd2_free(jh->b_committed_data, bh->b_size);
 865                         jh->b_committed_data = NULL;
 866                         if (jh->b_frozen_data) {
 867                                 jh->b_committed_data = jh->b_frozen_data;
 868                                 jh->b_frozen_data = NULL;
 869                                 jh->b_frozen_triggers = NULL;
 870                         }
 871                 } else if (jh->b_frozen_data) {
 872                         jbd2_free(jh->b_frozen_data, bh->b_size);
 873                         jh->b_frozen_data = NULL;
 874                         jh->b_frozen_triggers = NULL;
 875                 }
 876
 877                 spin_lock(&journal->j_list_lock);
 878                 cp_transaction = jh->b_cp_transaction;
 879                 if (cp_transaction) {
 880                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 881                         cp_transaction->t_chp_stats.cs_dropped++;
 882                         __jbd2_journal_remove_checkpoint(jh);
 883                 }
 884
 885                 /* Only re-checkpoint the buffer_head if it is marked
 886                  * dirty.  If the buffer was added to the BJ_Forget list
 887                  * by jbd2_journal_forget, it may no longer be dirty and
 888                  * there's no point in keeping a checkpoint record for
 889                  * it. */
 890
 891                 /* A buffer which has been freed while still being
 892                  * journaled by a previous transaction may end up still
 893                  * being dirty here, but we want to avoid writing back
 894                  * that buffer in the future after the "add to orphan"
 895                  * operation has been committed.  That's not only a performance
 896                  * gain, it also stops aliasing problems if the buffer is
 897                  * left behind for writeback and gets reallocated for another
 898                  * use in a different page. */
 899                 if (buffer_freed(bh) && !jh->b_next_transaction) {
 900                         clear_buffer_freed(bh);
 901                         clear_buffer_jbddirty(bh);
 902                 }
 903
 904                 if (buffer_jbddirty(bh)) {
 905                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 906                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 907                         if (is_journal_aborted(journal))
 908                                 clear_buffer_jbddirty(bh);
 909                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 910                         __jbd2_journal_refile_buffer(jh);
 911                         jbd_unlock_bh_state(bh);
 912                 } else {
 913                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 914                         /* The buffer on BJ_Forget list and not jbddirty means
 915                          * it has been freed by this transaction and hence it
 916                          * could not have been reallocated until this
 917                          * transaction has committed. *BUT* it could be
 918                          * reallocated once we have written all the data to
 919                          * disk and before we process the buffer on BJ_Forget
 920                          * list. */
 921                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 922                         __jbd2_journal_refile_buffer(jh);
 923                         if (!jh->b_transaction) {
 924                                 jbd_unlock_bh_state(bh);
 925                                  /* needs a brelse */
 926                                 jbd2_journal_remove_journal_head(bh);
 927                                 release_buffer_page(bh);
 928                         } else
 929                                 jbd_unlock_bh_state(bh);
 930                 }
 931                 cond_resched_lock(&journal->j_list_lock);
 932         }
 933         spin_unlock(&journal->j_list_lock);
 934         /*
 935          * This is a bit sleazy.  We use j_list_lock to protect transition
 936          * of a transaction into T_FINISHED state and calling
 937          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 938          * other checkpointing code processing the transaction...
 939          */
 940         write_lock(&journal->j_state_lock);
 941         spin_lock(&journal->j_list_lock);
 942         /*
 943          * Now recheck if some buffers did not get attached to the transaction
 944          * while the lock was dropped...
 945          */
 946         if (commit_transaction->t_forget) {
 947                 spin_unlock(&journal->j_list_lock);
 948                 write_unlock(&journal->j_state_lock);
 949                 goto restart_loop;
 950         }
 951
 952         /* Done with this transaction! */
 953
 954         jbd_debug(3, "JBD: commit phase 7\n");
 955
 956         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 957
 958         commit_transaction->t_start = jiffies;
 959         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
 960                                               commit_transaction->t_start);
 961
 962         /*
 963          * File the transaction statistics
 964          */
 965         stats.ts_tid = commit_transaction->t_tid;
 966         stats.run.rs_handle_count =
 967                 atomic_read(&commit_transaction->t_handle_count);
 968         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
 969                              commit_transaction->t_tid, &stats.run);
 970
 971         /*
 972          * Calculate overall stats
 973          */
 974         spin_lock(&journal->j_history_lock);
 975         journal->j_stats.ts_tid++;
 976         journal->j_stats.run.rs_wait += stats.run.rs_wait;
 977         journal->j_stats.run.rs_running += stats.run.rs_running;
 978         journal->j_stats.run.rs_locked += stats.run.rs_locked;
 979         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
 980         journal->j_stats.run.rs_logging += stats.run.rs_logging;
 981         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
 982         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
 983         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
 984         spin_unlock(&journal->j_history_lock);
 985
 986         commit_transaction->t_state = T_FINISHED;
 987         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 988         journal->j_commit_sequence = commit_transaction->t_tid;
 989         journal->j_committing_transaction = NULL;
 990         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 991
 992         /*
 993          * weight the commit time higher than the average time so we don't
 994          * react too strongly to vast changes in the commit time
 995          */
 996         if (likely(journal->j_average_commit_time))
 997                 journal->j_average_commit_time = (commit_time +
 998                                 journal->j_average_commit_time*3) / 4;
 999         else
1000                 journal->j_average_commit_time = commit_time;
1001         write_unlock(&journal->j_state_lock);
1002
1003         if (commit_transaction->t_checkpoint_list == NULL &&
1004             commit_transaction->t_checkpoint_io_list == NULL) {
1005                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1006                 to_free = 1;
1007         } else {
1008                 if (journal->j_checkpoint_transactions == NULL) {
1009                         journal->j_checkpoint_transactions = commit_transaction;
1010                         commit_transaction->t_cpnext = commit_transaction;
1011                         commit_transaction->t_cpprev = commit_transaction;
1012                 } else {
1013                         commit_transaction->t_cpnext =
1014                                 journal->j_checkpoint_transactions;
1015                         commit_transaction->t_cpprev =
1016                                 commit_transaction->t_cpnext->t_cpprev;
1017                         commit_transaction->t_cpnext->t_cpprev =
1018                                 commit_transaction;
1019                         commit_transaction->t_cpprev->t_cpnext =
1020                                 commit_transaction;
1021                 }
1022         }
1023         spin_unlock(&journal->j_list_lock);
1024
1025         if (journal->j_commit_callback)
1026                 journal->j_commit_callback(journal, commit_transaction);
1027
1028         trace_jbd2_end_commit(journal, commit_transaction);
1029         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1030                   journal->j_commit_sequence, journal->j_tail_sequence);
1031         if (to_free)
1032                 kfree(commit_transaction);
1033
1034         wake_up(&journal->j_wait_done_commit);
1035 }