/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>
#include <asm/system.h>
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}
/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page || page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);
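	/*
	 * Fill in the commit block header: the magic number, block type and
	 * sequence number let recovery recognise the record; the timestamp
	 * records when the transaction was committed.
	 */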
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}
	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;
	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
		if (ret == -EOPNOTSUPP) {
			printk(KERN_WARNING
			       "JBD2: Disabling barriers on %s, "
			       "not supported by device\n", journal->j_devname);
			write_lock(&journal->j_state_lock);
			journal->j_flags &= ~JBD2_BARRIER;
			write_unlock(&journal->j_state_lock);

			/* And try again, without the barrier */
			lock_buffer(bh);
			set_buffer_uptodate(bh);
			clear_buffer_dirty(bh);
			ret = submit_bh(WRITE_SYNC_PLUG, bh);
		}
	} else {
		ret = submit_bh(WRITE_SYNC_PLUG, bh);
	}
	*cbh = bh;
	return ret;
}
/*
 * This function, along with journal_submit_commit_record, allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

retry:
	clear_buffer_dirty(bh);
	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
		printk(KERN_WARNING
		       "JBD2: %s: disabling barriers on %s - not supported "
		       "by device\n", __func__, journal->j_devname);
		write_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		write_unlock(&journal->j_state_lock);

		lock_buffer(bh);
		clear_buffer_dirty(bh);
		set_buffer_uptodate(bh);
		bh->b_end_io = journal_end_buffer_io_sync;

		ret = submit_bh(WRITE_SYNC_PLUG, bh);
		if (ret) {
			unlock_buffer(bh);
			return ret;
		}
		goto retry;
	}

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);		/* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
/*
 * write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * submit the inode data buffers. We use writepage
		 * instead of writepages. Because writepages can do
		 * block allocation with delalloc. We need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		commit_transaction->t_flushed_data_blocks = 1;
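		/*
		 * The bit clear must be ordered before the wakeup so that a
		 * waiter in jbd2_journal_release_jbd_inode() cannot see the
		 * bit still set after being woken.
		 */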
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
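/*
 * Fold one journal buffer into the running transaction checksum.  The
 * buffer's page may live in highmem, hence the kmap_atomic().
 */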
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
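/*
 * Store a block number in an on-disk tag: the low 32 bits always go in
 * t_blocknr; with 64-bit tags the high bits go in t_blocknr_high.  For
 * example, block 0x123456789a is stored as t_blocknr = 0x3456789a and
 * t_blocknr_high = 0x12.
 */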
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	int write_op = WRITE;
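	/*
	 * The commit proceeds in phases: drain updates and lock down the
	 * transaction, flush data and revoke records, write the metadata
	 * through the log, wait for all of that IO, write the commit
	 * record, and finally move surviving buffers onto checkpoint lists.
	 */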
	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif
	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	/*
	 * Use plugged writes here, since we want to submit several before
	 * we unplug the device. We don't do explicit unplugging in here,
	 * instead we rely on sync_buffer() doing the unplug for us.
	 */
	if (commit_transaction->t_synchronous_commit)
		write_op = WRITE_SYNC_PLUG;
	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);
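	/*
	 * Drain all handles attached to this transaction.  New handles
	 * cannot join once t_state is T_LOCKED, so when t_updates drops
	 * to zero the transaction is quiescent.
	 */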
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);
	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
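	/*
	 * Revokes recorded from now on land in the fresh table; the old
	 * table's records are written out with this transaction below.
	 */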
	jbd2_journal_switch_revoke_table(journal);
	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD: commit phase 2\n");
	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  write_op);

	jbd_debug(3, "JBD: commit phase 2\n");
	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));
	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}
		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}
		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);
		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}

		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);
		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;
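		/*
		 * Only the first tag in a descriptor block is followed by
		 * the 16-byte journal UUID; later tags carry
		 * JBD2_FLAG_SAME_UUID instead, which is why the space check
		 * below keeps tag_bytes + 16 bytes in reserve.
		 */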
		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(write_op, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}
	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_flushed_data_blocks &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
			BLKDEV_IFL_WAIT);
	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
		if (journal->j_flags & JBD2_BARRIER)
			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
				BLKDEV_IFL_WAIT);
	}
	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
			"on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}
	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);
	jbd_debug(3, "JBD: commit phase 5\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);

	if (err)
		jbd2_journal_abort(journal, err);
	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}
		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}
		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}
		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}
	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
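	/*
	 * e.g. a 16ms commit against an 8ms running average moves the
	 * average to (16 + 3 * 8) / 4 = 10ms.
	 */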
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);
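	/*
	 * If nothing is left to checkpoint, the transaction can be freed
	 * right away; otherwise link it into the journal's circular list
	 * of checkpointing transactions.
	 */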
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);
	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}