Merge branch 'i2c-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelvar...
[pandora-kernel.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31 #include <asm/system.h>
32
/*
 * IO completion handler for the temporary BJ_IO buffer_heads written
 * out during commit: record whether the write succeeded on the buffer
 * and release the buffer lock taken before submission.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
45
46 /*
47  * When an ext4 file is truncated, it is possible that some pages are not
48  * successfully freed, because they are attached to a committing transaction.
49  * After the transaction commits, these pages are left on the LRU, with no
50  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
51  * by the VM, but their apparent absence upsets the VM accounting, and it makes
52  * the numbers in /proc/meminfo look odd.
53  *
54  * So here, we have a buffer which has just come off the forget list.  Look to
55  * see if we can strip all buffers from the backing page.
56  *
57  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
58  * caller provided us with a ref against the buffer, and we drop that here.
59  */
60 static void release_buffer_page(struct buffer_head *bh)
61 {
62         struct page *page;
63
64         if (buffer_dirty(bh))
65                 goto nope;
66         if (atomic_read(&bh->b_count) != 1)
67                 goto nope;
68         page = bh->b_page;
69         if (!page)
70                 goto nope;
71         if (page->mapping)
72                 goto nope;
73
74         /* OK, it's a truncated page */
75         if (!trylock_page(page))
76                 goto nope;
77
78         page_cache_get(page);
79         __brelse(bh);
80         try_to_free_buffers(page);
81         unlock_page(page);
82         page_cache_release(page);
83         return;
84
85 nope:
86         __brelse(bh);
87 }
88
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * On success, *cbh is set to the commit block's buffer_head so the
 * caller can later wait on it via journal_wait_on_commit_record().
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	/* Clear the out-parameter up front so the caller never sees a
	 * stale pointer on any of the early-return paths below. */
	*cbh = NULL;

	/* Aborted journal: nothing to write, and not an error here. */
	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Fill in the on-disk commit block: magic, block type, the tid
	 * of the transaction being committed, and a commit timestamp. */
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	/* With transactional checksums enabled, record the running
	 * crc32 of the transaction's blocks in the commit header. */
	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	/* Lock, mark clean+uptodate and hook our end_io before submit;
	 * journal_end_buffer_io_sync() will unlock on completion. */
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	/* Synchronous (non-async-commit) barriers: issue the commit
	 * block with a cache flush + FUA so it reaches stable storage
	 * ordered after the transaction's other blocks. */
	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}
149
/*
 * This function along with journal_submit_commit_record
 * allows to write the commit record asynchronously.
 *
 * Returns 0 on success, or -EIO if the commit block write failed.
 * Drops the buffer and journal_head references taken when the commit
 * block was allocated.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	/* NOTE(review): dirty bit is cleared before waiting,
	 * presumably so the commit block is not written out a second
	 * time by regular writeback — confirm against submit path. */
	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
169
170 /*
171  * write the filemap data using writepage() address_space_operations.
172  * We don't do block allocation here even for delalloc. We don't
173  * use writepages() because with dealyed allocation we may be doing
174  * block allocation in writepages().
175  */
176 static int journal_submit_inode_data_buffers(struct address_space *mapping)
177 {
178         int ret;
179         struct writeback_control wbc = {
180                 .sync_mode =  WB_SYNC_ALL,
181                 .nr_to_write = mapping->nrpages * 2,
182                 .range_start = 0,
183                 .range_end = i_size_read(mapping->host),
184         };
185
186         ret = generic_writepages(mapping, &wbc);
187         return ret;
188 }
189
/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 *
 * Returns 0 on success, or the first submission error encountered
 * (later errors are dropped so the first one is reported).
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		/* Pin this inode with JI_COMMIT_RUNNING, then drop the
		 * list lock across the (blocking) writeout.  The flag
		 * keeps the inode from being released meanwhile. */
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * submit the inode data buffers. We use writepage
		 * instead of writepages. Because writepages can do
		 * block allocation  with delalloc. We need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		/* Note: set unconditionally for every inode on the
		 * list, whether or not any pages were actually dirty. */
		commit_transaction->t_flushed_data_blocks = 1;
		/* Release the inode: clear-bit, barrier, then wake any
		 * waiter sleeping on __JI_COMMIT_RUNNING. */
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
230
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 * Returns 0 on success, or the first writeback error seen; on error
 * AS_EIO is re-armed on the mapping so a later fsync() still sees it.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		/* Pin the inode, drop the lock, and wait for its pages. */
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			/* Keep only the first error for the caller. */
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		/* Unpin: clear-bit, barrier, wake bit-waiters. */
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/* Inode was also modified by the next
			 * transaction: move it onto that list. */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			/* No further transaction: detach the inode. */
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
283
284 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
285 {
286         struct page *page = bh->b_page;
287         char *addr;
288         __u32 checksum;
289
290         addr = kmap_atomic(page, KM_USER0);
291         checksum = crc32_be(crc32_sum,
292                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
293         kunmap_atomic(addr, KM_USER0);
294
295         return checksum;
296 }
297
298 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
299                                    unsigned long long block)
300 {
301         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
302         if (tag_bytes > JBD2_TAG_SIZE32)
303                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
304 }
305
306 /*
307  * jbd2_journal_commit_transaction
308  *
309  * The primary function for committing a transaction to the log.  This
310  * function is called by the journal thread to begin a complete commit.
311  */
312 void jbd2_journal_commit_transaction(journal_t *journal)
313 {
314         struct transaction_stats_s stats;
315         transaction_t *commit_transaction;
316         struct journal_head *jh, *new_jh, *descriptor;
317         struct buffer_head **wbuf = journal->j_wbuf;
318         int bufs;
319         int flags;
320         int err;
321         unsigned long long blocknr;
322         ktime_t start_time;
323         u64 commit_time;
324         char *tagp = NULL;
325         journal_header_t *header;
326         journal_block_tag_t *tag = NULL;
327         int space_left = 0;
328         int first_tag = 0;
329         int tag_flag;
330         int i, to_free = 0;
331         int tag_bytes = journal_tag_bytes(journal);
332         struct buffer_head *cbh = NULL; /* For transactional checksums */
333         __u32 crc32_sum = ~0;
334         struct blk_plug plug;
335
336         /*
337          * First job: lock down the current transaction and wait for
338          * all outstanding updates to complete.
339          */
340
341         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
342         if (journal->j_flags & JBD2_FLUSHED) {
343                 jbd_debug(3, "super block updated\n");
344                 jbd2_journal_update_superblock(journal, 1);
345         } else {
346                 jbd_debug(3, "superblock not updated\n");
347         }
348
349         J_ASSERT(journal->j_running_transaction != NULL);
350         J_ASSERT(journal->j_committing_transaction == NULL);
351
352         commit_transaction = journal->j_running_transaction;
353         J_ASSERT(commit_transaction->t_state == T_RUNNING);
354
355         trace_jbd2_start_commit(journal, commit_transaction);
356         jbd_debug(1, "JBD: starting commit of transaction %d\n",
357                         commit_transaction->t_tid);
358
359         write_lock(&journal->j_state_lock);
360         commit_transaction->t_state = T_LOCKED;
361
362         trace_jbd2_commit_locking(journal, commit_transaction);
363         stats.run.rs_wait = commit_transaction->t_max_wait;
364         stats.run.rs_locked = jiffies;
365         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
366                                               stats.run.rs_locked);
367
368         spin_lock(&commit_transaction->t_handle_lock);
369         while (atomic_read(&commit_transaction->t_updates)) {
370                 DEFINE_WAIT(wait);
371
372                 prepare_to_wait(&journal->j_wait_updates, &wait,
373                                         TASK_UNINTERRUPTIBLE);
374                 if (atomic_read(&commit_transaction->t_updates)) {
375                         spin_unlock(&commit_transaction->t_handle_lock);
376                         write_unlock(&journal->j_state_lock);
377                         schedule();
378                         write_lock(&journal->j_state_lock);
379                         spin_lock(&commit_transaction->t_handle_lock);
380                 }
381                 finish_wait(&journal->j_wait_updates, &wait);
382         }
383         spin_unlock(&commit_transaction->t_handle_lock);
384
385         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
386                         journal->j_max_transaction_buffers);
387
388         /*
389          * First thing we are allowed to do is to discard any remaining
390          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
391          * that there are no such buffers: if a large filesystem
392          * operation like a truncate needs to split itself over multiple
393          * transactions, then it may try to do a jbd2_journal_restart() while
394          * there are still BJ_Reserved buffers outstanding.  These must
395          * be released cleanly from the current transaction.
396          *
397          * In this case, the filesystem must still reserve write access
398          * again before modifying the buffer in the new transaction, but
399          * we do not require it to remember exactly which old buffers it
400          * has reserved.  This is consistent with the existing behaviour
401          * that multiple jbd2_journal_get_write_access() calls to the same
402          * buffer are perfectly permissible.
403          */
404         while (commit_transaction->t_reserved_list) {
405                 jh = commit_transaction->t_reserved_list;
406                 JBUFFER_TRACE(jh, "reserved, unused: refile");
407                 /*
408                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
409                  * leave undo-committed data.
410                  */
411                 if (jh->b_committed_data) {
412                         struct buffer_head *bh = jh2bh(jh);
413
414                         jbd_lock_bh_state(bh);
415                         jbd2_free(jh->b_committed_data, bh->b_size);
416                         jh->b_committed_data = NULL;
417                         jbd_unlock_bh_state(bh);
418                 }
419                 jbd2_journal_refile_buffer(journal, jh);
420         }
421
422         /*
423          * Now try to drop any written-back buffers from the journal's
424          * checkpoint lists.  We do this *before* commit because it potentially
425          * frees some memory
426          */
427         spin_lock(&journal->j_list_lock);
428         __jbd2_journal_clean_checkpoint_list(journal);
429         spin_unlock(&journal->j_list_lock);
430
431         jbd_debug (3, "JBD: commit phase 1\n");
432
433         /*
434          * Switch to a new revoke table.
435          */
436         jbd2_journal_switch_revoke_table(journal);
437
438         trace_jbd2_commit_flushing(journal, commit_transaction);
439         stats.run.rs_flushing = jiffies;
440         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
441                                              stats.run.rs_flushing);
442
443         commit_transaction->t_state = T_FLUSH;
444         journal->j_committing_transaction = commit_transaction;
445         journal->j_running_transaction = NULL;
446         start_time = ktime_get();
447         commit_transaction->t_log_start = journal->j_head;
448         wake_up(&journal->j_wait_transaction_locked);
449         write_unlock(&journal->j_state_lock);
450
451         jbd_debug (3, "JBD: commit phase 2\n");
452
453         /*
454          * Now start flushing things to disk, in the order they appear
455          * on the transaction lists.  Data blocks go first.
456          */
457         err = journal_submit_data_buffers(journal, commit_transaction);
458         if (err)
459                 jbd2_journal_abort(journal, err);
460
461         blk_start_plug(&plug);
462         jbd2_journal_write_revoke_records(journal, commit_transaction,
463                                           WRITE_SYNC);
464         blk_finish_plug(&plug);
465
466         jbd_debug(3, "JBD: commit phase 2\n");
467
468         /*
469          * Way to go: we have now written out all of the data for a
470          * transaction!  Now comes the tricky part: we need to write out
471          * metadata.  Loop over the transaction's entire buffer list:
472          */
473         write_lock(&journal->j_state_lock);
474         commit_transaction->t_state = T_COMMIT;
475         write_unlock(&journal->j_state_lock);
476
477         trace_jbd2_commit_logging(journal, commit_transaction);
478         stats.run.rs_logging = jiffies;
479         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
480                                                stats.run.rs_logging);
481         stats.run.rs_blocks =
482                 atomic_read(&commit_transaction->t_outstanding_credits);
483         stats.run.rs_blocks_logged = 0;
484
485         J_ASSERT(commit_transaction->t_nr_buffers <=
486                  atomic_read(&commit_transaction->t_outstanding_credits));
487
488         err = 0;
489         descriptor = NULL;
490         bufs = 0;
491         blk_start_plug(&plug);
492         while (commit_transaction->t_buffers) {
493
494                 /* Find the next buffer to be journaled... */
495
496                 jh = commit_transaction->t_buffers;
497
498                 /* If we're in abort mode, we just un-journal the buffer and
499                    release it. */
500
501                 if (is_journal_aborted(journal)) {
502                         clear_buffer_jbddirty(jh2bh(jh));
503                         JBUFFER_TRACE(jh, "journal is aborting: refile");
504                         jbd2_buffer_abort_trigger(jh,
505                                                   jh->b_frozen_data ?
506                                                   jh->b_frozen_triggers :
507                                                   jh->b_triggers);
508                         jbd2_journal_refile_buffer(journal, jh);
509                         /* If that was the last one, we need to clean up
510                          * any descriptor buffers which may have been
511                          * already allocated, even if we are now
512                          * aborting. */
513                         if (!commit_transaction->t_buffers)
514                                 goto start_journal_io;
515                         continue;
516                 }
517
518                 /* Make sure we have a descriptor block in which to
519                    record the metadata buffer. */
520
521                 if (!descriptor) {
522                         struct buffer_head *bh;
523
524                         J_ASSERT (bufs == 0);
525
526                         jbd_debug(4, "JBD: get descriptor\n");
527
528                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
529                         if (!descriptor) {
530                                 jbd2_journal_abort(journal, -EIO);
531                                 continue;
532                         }
533
534                         bh = jh2bh(descriptor);
535                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
536                                 (unsigned long long)bh->b_blocknr, bh->b_data);
537                         header = (journal_header_t *)&bh->b_data[0];
538                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
539                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
540                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
541
542                         tagp = &bh->b_data[sizeof(journal_header_t)];
543                         space_left = bh->b_size - sizeof(journal_header_t);
544                         first_tag = 1;
545                         set_buffer_jwrite(bh);
546                         set_buffer_dirty(bh);
547                         wbuf[bufs++] = bh;
548
549                         /* Record it so that we can wait for IO
550                            completion later */
551                         BUFFER_TRACE(bh, "ph3: file as descriptor");
552                         jbd2_journal_file_buffer(descriptor, commit_transaction,
553                                         BJ_LogCtl);
554                 }
555
556                 /* Where is the buffer to be written? */
557
558                 err = jbd2_journal_next_log_block(journal, &blocknr);
559                 /* If the block mapping failed, just abandon the buffer
560                    and repeat this loop: we'll fall into the
561                    refile-on-abort condition above. */
562                 if (err) {
563                         jbd2_journal_abort(journal, err);
564                         continue;
565                 }
566
567                 /*
568                  * start_this_handle() uses t_outstanding_credits to determine
569                  * the free space in the log, but this counter is changed
570                  * by jbd2_journal_next_log_block() also.
571                  */
572                 atomic_dec(&commit_transaction->t_outstanding_credits);
573
574                 /* Bump b_count to prevent truncate from stumbling over
575                    the shadowed buffer!  @@@ This can go if we ever get
576                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
577                 atomic_inc(&jh2bh(jh)->b_count);
578
579                 /* Make a temporary IO buffer with which to write it out
580                    (this will requeue both the metadata buffer and the
581                    temporary IO buffer). new_bh goes on BJ_IO*/
582
583                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
584                 /*
585                  * akpm: jbd2_journal_write_metadata_buffer() sets
586                  * new_bh->b_transaction to commit_transaction.
587                  * We need to clean this up before we release new_bh
588                  * (which is of type BJ_IO)
589                  */
590                 JBUFFER_TRACE(jh, "ph3: write metadata");
591                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
592                                                       jh, &new_jh, blocknr);
593                 if (flags < 0) {
594                         jbd2_journal_abort(journal, flags);
595                         continue;
596                 }
597                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
598                 wbuf[bufs++] = jh2bh(new_jh);
599
600                 /* Record the new block's tag in the current descriptor
601                    buffer */
602
603                 tag_flag = 0;
604                 if (flags & 1)
605                         tag_flag |= JBD2_FLAG_ESCAPE;
606                 if (!first_tag)
607                         tag_flag |= JBD2_FLAG_SAME_UUID;
608
609                 tag = (journal_block_tag_t *) tagp;
610                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
611                 tag->t_flags = cpu_to_be32(tag_flag);
612                 tagp += tag_bytes;
613                 space_left -= tag_bytes;
614
615                 if (first_tag) {
616                         memcpy (tagp, journal->j_uuid, 16);
617                         tagp += 16;
618                         space_left -= 16;
619                         first_tag = 0;
620                 }
621
622                 /* If there's no more to do, or if the descriptor is full,
623                    let the IO rip! */
624
625                 if (bufs == journal->j_wbufsize ||
626                     commit_transaction->t_buffers == NULL ||
627                     space_left < tag_bytes + 16) {
628
629                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
630
631                         /* Write an end-of-descriptor marker before
632                            submitting the IOs.  "tag" still points to
633                            the last tag we set up. */
634
635                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
636
637 start_journal_io:
638                         for (i = 0; i < bufs; i++) {
639                                 struct buffer_head *bh = wbuf[i];
640                                 /*
641                                  * Compute checksum.
642                                  */
643                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
644                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
645                                         crc32_sum =
646                                             jbd2_checksum_data(crc32_sum, bh);
647                                 }
648
649                                 lock_buffer(bh);
650                                 clear_buffer_dirty(bh);
651                                 set_buffer_uptodate(bh);
652                                 bh->b_end_io = journal_end_buffer_io_sync;
653                                 submit_bh(WRITE_SYNC, bh);
654                         }
655                         cond_resched();
656                         stats.run.rs_blocks_logged += bufs;
657
658                         /* Force a new descriptor to be generated next
659                            time round the loop. */
660                         descriptor = NULL;
661                         bufs = 0;
662                 }
663         }
664
665         err = journal_finish_inode_data_buffers(journal, commit_transaction);
666         if (err) {
667                 printk(KERN_WARNING
668                         "JBD2: Detected IO errors while flushing file data "
669                        "on %s\n", journal->j_devname);
670                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
671                         jbd2_journal_abort(journal, err);
672                 err = 0;
673         }
674
675         /* 
676          * If the journal is not located on the file system device,
677          * then we must flush the file system device before we issue
678          * the commit record
679          */
680         if (commit_transaction->t_flushed_data_blocks &&
681             (journal->j_fs_dev != journal->j_dev) &&
682             (journal->j_flags & JBD2_BARRIER))
683                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
684
685         /* Done it all: now write the commit record asynchronously. */
686         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
687                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
688                 err = journal_submit_commit_record(journal, commit_transaction,
689                                                  &cbh, crc32_sum);
690                 if (err)
691                         __jbd2_journal_abort_hard(journal);
692         }
693
694         blk_finish_plug(&plug);
695
696         /* Lo and behold: we have just managed to send a transaction to
697            the log.  Before we can commit it, wait for the IO so far to
698            complete.  Control buffers being written are on the
699            transaction's t_log_list queue, and metadata buffers are on
700            the t_iobuf_list queue.
701
	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 *
	 * Note the goto-restart pattern below: both wait_on_buffer() and
	 * cond_resched() may sleep, and the iobuf list can change while we
	 * sleep, so we always restart the scan from the list tail.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		/* Take the tail entry (reverse order, per the comment above) */
		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		/*
		 * A failed journal write is fatal for the journal; record it
		 * here and abort the journal once all waits are finished.
		 */
		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		/* As in phase 3: remember the error, abort after the loop */
		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	/*
	 * Write the commit record.  Without ASYNC_COMMIT it is submitted
	 * here, after all journal blocks are known to be on disk; with
	 * ASYNC_COMMIT it was presumably submitted earlier (outside this
	 * view) and we only wait for it, then issue a cache flush so the
	 * commit block cannot overtake the journal blocks on the platter.
	 */
	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 *
	 * The lock is dropped for each buffer while we work on it (the
	 * per-buffer work can sleep/allocate), which is why the loop
	 * re-reads t_forget each iteration and why we recheck it once
	 * more under both locks after the loop.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation been committed,  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			/* An aborted journal must not write anything back */
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	/*
	 * Retire the transaction: mark it finished and detach it from
	 * the journal.  Done under j_state_lock (taken above); per the
	 * "sleazy" comment, j_list_lock additionally guards the
	 * T_FINISHED transition against checkpointing code.
	 */
	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);

	/*
	 * If nothing is left to checkpoint the transaction can be freed
	 * right away; otherwise link it into the journal's circular
	 * doubly-linked list of checkpoint transactions (at the head).
	 * j_list_lock is still held from above.
	 */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			/* First entry: the list is a self-linked singleton */
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	/* Let the filesystem (e.g. ext4) run its per-commit callback */
	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	/* Free only after the callback/tracepoint no longer need it */
	if (to_free)
		kfree(commit_transaction);

	/* Wake anyone sleeping in jbd2_log_wait_commit() et al. */
	wake_up(&journal->j_wait_done_commit);
}