6494c81e3b0a9cb3d07d2eec52d585e6e3a46117
[pandora-kernel.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31 #include <asm/system.h>
32
/*
 * I/O completion callback installed on the buffers this file submits
 * (temporary BJ_IO buffer_heads and commit blocks).
 *
 * Records the outcome of the I/O in the buffer's uptodate bit and then
 * unlocks the buffer so that anyone sleeping in wait_on_buffer() can
 * proceed.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);
        unlock_buffer(bh);
}
45
46 /*
47  * When an ext4 file is truncated, it is possible that some pages are not
48  * successfully freed, because they are attached to a committing transaction.
49  * After the transaction commits, these pages are left on the LRU, with no
50  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
51  * by the VM, but their apparent absence upsets the VM accounting, and it makes
52  * the numbers in /proc/meminfo look odd.
53  *
54  * So here, we have a buffer which has just come off the forget list.  Look to
55  * see if we can strip all buffers from the backing page.
56  *
57  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
58  * caller provided us with a ref against the buffer, and we drop that here.
59  */
60 static void release_buffer_page(struct buffer_head *bh)
61 {
62         struct page *page;
63
64         if (buffer_dirty(bh))
65                 goto nope;
66         if (atomic_read(&bh->b_count) != 1)
67                 goto nope;
68         page = bh->b_page;
69         if (!page)
70                 goto nope;
71         if (page->mapping)
72                 goto nope;
73
74         /* OK, it's a truncated page */
75         if (!trylock_page(page))
76                 goto nope;
77
78         page_cache_get(page);
79         __brelse(bh);
80         try_to_free_buffers(page);
81         unlock_page(page);
82         page_cache_release(page);
83         return;
84
85 nope:
86         __brelse(bh);
87 }
88
89 /*
90  * Done it all: now submit the commit record.  We should have
91  * cleaned up our previous buffers by now, so if we are in abort
92  * mode we can now just skip the rest of the journal write
93  * entirely.
94  *
95  * Returns 1 if the journal needs to be aborted or 0 on success
96  */
97 static int journal_submit_commit_record(journal_t *journal,
98                                         transaction_t *commit_transaction,
99                                         struct buffer_head **cbh,
100                                         __u32 crc32_sum)
101 {
102         struct journal_head *descriptor;
103         struct commit_header *tmp;
104         struct buffer_head *bh;
105         int ret;
106         struct timespec now = current_kernel_time();
107
108         if (is_journal_aborted(journal))
109                 return 0;
110
111         descriptor = jbd2_journal_get_descriptor_buffer(journal);
112         if (!descriptor)
113                 return 1;
114
115         bh = jh2bh(descriptor);
116
117         tmp = (struct commit_header *)bh->b_data;
118         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
119         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
120         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
121         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
122         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
123
124         if (JBD2_HAS_COMPAT_FEATURE(journal,
125                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
126                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
127                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
128                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
129         }
130
131         JBUFFER_TRACE(descriptor, "submit commit block");
132         lock_buffer(bh);
133         clear_buffer_dirty(bh);
134         set_buffer_uptodate(bh);
135         bh->b_end_io = journal_end_buffer_io_sync;
136
137         if (journal->j_flags & JBD2_BARRIER &&
138             !JBD2_HAS_INCOMPAT_FEATURE(journal,
139                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
140                 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
141                 if (ret == -EOPNOTSUPP) {
142                         printk(KERN_WARNING
143                                "JBD2: Disabling barriers on %s, "
144                                "not supported by device\n", journal->j_devname);
145                         write_lock(&journal->j_state_lock);
146                         journal->j_flags &= ~JBD2_BARRIER;
147                         write_unlock(&journal->j_state_lock);
148
149                         /* And try again, without the barrier */
150                         lock_buffer(bh);
151                         set_buffer_uptodate(bh);
152                         clear_buffer_dirty(bh);
153                         ret = submit_bh(WRITE_SYNC_PLUG, bh);
154                 }
155         } else {
156                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
157         }
158         *cbh = bh;
159         return ret;
160 }
161
162 /*
163  * This function along with journal_submit_commit_record
164  * allows to write the commit record asynchronously.
165  */
166 static int journal_wait_on_commit_record(journal_t *journal,
167                                          struct buffer_head *bh)
168 {
169         int ret = 0;
170
171 retry:
172         clear_buffer_dirty(bh);
173         wait_on_buffer(bh);
174         if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
175                 printk(KERN_WARNING
176                        "JBD2: %s: disabling barries on %s - not supported "
177                        "by device\n", __func__, journal->j_devname);
178                 write_lock(&journal->j_state_lock);
179                 journal->j_flags &= ~JBD2_BARRIER;
180                 write_unlock(&journal->j_state_lock);
181
182                 lock_buffer(bh);
183                 clear_buffer_dirty(bh);
184                 set_buffer_uptodate(bh);
185                 bh->b_end_io = journal_end_buffer_io_sync;
186
187                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
188                 if (ret) {
189                         unlock_buffer(bh);
190                         return ret;
191                 }
192                 goto retry;
193         }
194
195         if (unlikely(!buffer_uptodate(bh)))
196                 ret = -EIO;
197         put_bh(bh);            /* One for getblk() */
198         jbd2_journal_put_journal_head(bh2jh(bh));
199
200         return ret;
201 }
202
203 /*
204  * write the filemap data using writepage() address_space_operations.
205  * We don't do block allocation here even for delalloc. We don't
206  * use writepages() because with dealyed allocation we may be doing
207  * block allocation in writepages().
208  */
209 static int journal_submit_inode_data_buffers(struct address_space *mapping)
210 {
211         int ret;
212         struct writeback_control wbc = {
213                 .sync_mode =  WB_SYNC_ALL,
214                 .nr_to_write = mapping->nrpages * 2,
215                 .range_start = 0,
216                 .range_end = i_size_read(mapping->host),
217         };
218
219         ret = generic_writepages(mapping, &wbc);
220         return ret;
221 }
222
223 /*
224  * Submit all the data buffers of inode associated with the transaction to
225  * disk.
226  *
227  * We are in a committing transaction. Therefore no new inode can be added to
228  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
229  * operate on from being released while we write out pages.
230  */
231 static int journal_submit_data_buffers(journal_t *journal,
232                 transaction_t *commit_transaction)
233 {
234         struct jbd2_inode *jinode;
235         int err, ret = 0;
236         struct address_space *mapping;
237
238         spin_lock(&journal->j_list_lock);
239         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
240                 mapping = jinode->i_vfs_inode->i_mapping;
241                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
242                 spin_unlock(&journal->j_list_lock);
243                 /*
244                  * submit the inode data buffers. We use writepage
245                  * instead of writepages. Because writepages can do
246                  * block allocation  with delalloc. We need to write
247                  * only allocated blocks here.
248                  */
249                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
250                 err = journal_submit_inode_data_buffers(mapping);
251                 if (!ret)
252                         ret = err;
253                 spin_lock(&journal->j_list_lock);
254                 J_ASSERT(jinode->i_transaction == commit_transaction);
255                 commit_transaction->t_flushed_data_blocks = 1;
256                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
257                 smp_mb__after_clear_bit();
258                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
259         }
260         spin_unlock(&journal->j_list_lock);
261         return ret;
262 }
263
264 /*
265  * Wait for data submitted for writeout, refile inodes to proper
266  * transaction if needed.
267  *
268  */
269 static int journal_finish_inode_data_buffers(journal_t *journal,
270                 transaction_t *commit_transaction)
271 {
272         struct jbd2_inode *jinode, *next_i;
273         int err, ret = 0;
274
275         /* For locking, see the comment in journal_submit_data_buffers() */
276         spin_lock(&journal->j_list_lock);
277         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
278                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
279                 spin_unlock(&journal->j_list_lock);
280                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
281                 if (err) {
282                         /*
283                          * Because AS_EIO is cleared by
284                          * filemap_fdatawait_range(), set it again so
285                          * that user process can get -EIO from fsync().
286                          */
287                         set_bit(AS_EIO,
288                                 &jinode->i_vfs_inode->i_mapping->flags);
289
290                         if (!ret)
291                                 ret = err;
292                 }
293                 spin_lock(&journal->j_list_lock);
294                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
295                 smp_mb__after_clear_bit();
296                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
297         }
298
299         /* Now refile inode to proper lists */
300         list_for_each_entry_safe(jinode, next_i,
301                                  &commit_transaction->t_inode_list, i_list) {
302                 list_del(&jinode->i_list);
303                 if (jinode->i_next_transaction) {
304                         jinode->i_transaction = jinode->i_next_transaction;
305                         jinode->i_next_transaction = NULL;
306                         list_add(&jinode->i_list,
307                                 &jinode->i_transaction->t_inode_list);
308                 } else {
309                         jinode->i_transaction = NULL;
310                 }
311         }
312         spin_unlock(&journal->j_list_lock);
313
314         return ret;
315 }
316
317 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
318 {
319         struct page *page = bh->b_page;
320         char *addr;
321         __u32 checksum;
322
323         addr = kmap_atomic(page, KM_USER0);
324         checksum = crc32_be(crc32_sum,
325                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
326         kunmap_atomic(addr, KM_USER0);
327
328         return checksum;
329 }
330
331 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
332                                    unsigned long long block)
333 {
334         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
335         if (tag_bytes > JBD2_TAG_SIZE32)
336                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
337 }
338
339 /*
340  * jbd2_journal_commit_transaction
341  *
342  * The primary function for committing a transaction to the log.  This
343  * function is called by the journal thread to begin a complete commit.
344  */
345 void jbd2_journal_commit_transaction(journal_t *journal)
346 {
347         struct transaction_stats_s stats;
348         transaction_t *commit_transaction;
349         struct journal_head *jh, *new_jh, *descriptor;
350         struct buffer_head **wbuf = journal->j_wbuf;
351         int bufs;
352         int flags;
353         int err;
354         unsigned long long blocknr;
355         ktime_t start_time;
356         u64 commit_time;
357         char *tagp = NULL;
358         journal_header_t *header;
359         journal_block_tag_t *tag = NULL;
360         int space_left = 0;
361         int first_tag = 0;
362         int tag_flag;
363         int i, to_free = 0;
364         int tag_bytes = journal_tag_bytes(journal);
365         struct buffer_head *cbh = NULL; /* For transactional checksums */
366         __u32 crc32_sum = ~0;
367         int write_op = WRITE;
368
369         /*
370          * First job: lock down the current transaction and wait for
371          * all outstanding updates to complete.
372          */
373
374 #ifdef COMMIT_STATS
375         spin_lock(&journal->j_list_lock);
376         summarise_journal_usage(journal);
377         spin_unlock(&journal->j_list_lock);
378 #endif
379
380         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
381         if (journal->j_flags & JBD2_FLUSHED) {
382                 jbd_debug(3, "super block updated\n");
383                 jbd2_journal_update_superblock(journal, 1);
384         } else {
385                 jbd_debug(3, "superblock not updated\n");
386         }
387
388         J_ASSERT(journal->j_running_transaction != NULL);
389         J_ASSERT(journal->j_committing_transaction == NULL);
390
391         commit_transaction = journal->j_running_transaction;
392         J_ASSERT(commit_transaction->t_state == T_RUNNING);
393
394         trace_jbd2_start_commit(journal, commit_transaction);
395         jbd_debug(1, "JBD: starting commit of transaction %d\n",
396                         commit_transaction->t_tid);
397
398         write_lock(&journal->j_state_lock);
399         commit_transaction->t_state = T_LOCKED;
400
401         /*
402          * Use plugged writes here, since we want to submit several before
403          * we unplug the device. We don't do explicit unplugging in here,
404          * instead we rely on sync_buffer() doing the unplug for us.
405          */
406         if (commit_transaction->t_synchronous_commit)
407                 write_op = WRITE_SYNC_PLUG;
408         trace_jbd2_commit_locking(journal, commit_transaction);
409         stats.run.rs_wait = commit_transaction->t_max_wait;
410         stats.run.rs_locked = jiffies;
411         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
412                                               stats.run.rs_locked);
413
414         spin_lock(&commit_transaction->t_handle_lock);
415         while (atomic_read(&commit_transaction->t_updates)) {
416                 DEFINE_WAIT(wait);
417
418                 prepare_to_wait(&journal->j_wait_updates, &wait,
419                                         TASK_UNINTERRUPTIBLE);
420                 if (atomic_read(&commit_transaction->t_updates)) {
421                         spin_unlock(&commit_transaction->t_handle_lock);
422                         write_unlock(&journal->j_state_lock);
423                         schedule();
424                         write_lock(&journal->j_state_lock);
425                         spin_lock(&commit_transaction->t_handle_lock);
426                 }
427                 finish_wait(&journal->j_wait_updates, &wait);
428         }
429         spin_unlock(&commit_transaction->t_handle_lock);
430
431         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
432                         journal->j_max_transaction_buffers);
433
434         /*
435          * First thing we are allowed to do is to discard any remaining
436          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
437          * that there are no such buffers: if a large filesystem
438          * operation like a truncate needs to split itself over multiple
439          * transactions, then it may try to do a jbd2_journal_restart() while
440          * there are still BJ_Reserved buffers outstanding.  These must
441          * be released cleanly from the current transaction.
442          *
443          * In this case, the filesystem must still reserve write access
444          * again before modifying the buffer in the new transaction, but
445          * we do not require it to remember exactly which old buffers it
446          * has reserved.  This is consistent with the existing behaviour
447          * that multiple jbd2_journal_get_write_access() calls to the same
448          * buffer are perfectly permissable.
449          */
450         while (commit_transaction->t_reserved_list) {
451                 jh = commit_transaction->t_reserved_list;
452                 JBUFFER_TRACE(jh, "reserved, unused: refile");
453                 /*
454                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
455                  * leave undo-committed data.
456                  */
457                 if (jh->b_committed_data) {
458                         struct buffer_head *bh = jh2bh(jh);
459
460                         jbd_lock_bh_state(bh);
461                         jbd2_free(jh->b_committed_data, bh->b_size);
462                         jh->b_committed_data = NULL;
463                         jbd_unlock_bh_state(bh);
464                 }
465                 jbd2_journal_refile_buffer(journal, jh);
466         }
467
468         /*
469          * Now try to drop any written-back buffers from the journal's
470          * checkpoint lists.  We do this *before* commit because it potentially
471          * frees some memory
472          */
473         spin_lock(&journal->j_list_lock);
474         __jbd2_journal_clean_checkpoint_list(journal);
475         spin_unlock(&journal->j_list_lock);
476
477         jbd_debug (3, "JBD: commit phase 1\n");
478
479         /*
480          * Switch to a new revoke table.
481          */
482         jbd2_journal_switch_revoke_table(journal);
483
484         trace_jbd2_commit_flushing(journal, commit_transaction);
485         stats.run.rs_flushing = jiffies;
486         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
487                                              stats.run.rs_flushing);
488
489         commit_transaction->t_state = T_FLUSH;
490         journal->j_committing_transaction = commit_transaction;
491         journal->j_running_transaction = NULL;
492         start_time = ktime_get();
493         commit_transaction->t_log_start = journal->j_head;
494         wake_up(&journal->j_wait_transaction_locked);
495         write_unlock(&journal->j_state_lock);
496
497         jbd_debug (3, "JBD: commit phase 2\n");
498
499         /*
500          * Now start flushing things to disk, in the order they appear
501          * on the transaction lists.  Data blocks go first.
502          */
503         err = journal_submit_data_buffers(journal, commit_transaction);
504         if (err)
505                 jbd2_journal_abort(journal, err);
506
507         jbd2_journal_write_revoke_records(journal, commit_transaction,
508                                           write_op);
509
510         jbd_debug(3, "JBD: commit phase 2\n");
511
512         /*
513          * Way to go: we have now written out all of the data for a
514          * transaction!  Now comes the tricky part: we need to write out
515          * metadata.  Loop over the transaction's entire buffer list:
516          */
517         write_lock(&journal->j_state_lock);
518         commit_transaction->t_state = T_COMMIT;
519         write_unlock(&journal->j_state_lock);
520
521         trace_jbd2_commit_logging(journal, commit_transaction);
522         stats.run.rs_logging = jiffies;
523         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
524                                                stats.run.rs_logging);
525         stats.run.rs_blocks =
526                 atomic_read(&commit_transaction->t_outstanding_credits);
527         stats.run.rs_blocks_logged = 0;
528
529         J_ASSERT(commit_transaction->t_nr_buffers <=
530                  atomic_read(&commit_transaction->t_outstanding_credits));
531
532         err = 0;
533         descriptor = NULL;
534         bufs = 0;
535         while (commit_transaction->t_buffers) {
536
537                 /* Find the next buffer to be journaled... */
538
539                 jh = commit_transaction->t_buffers;
540
541                 /* If we're in abort mode, we just un-journal the buffer and
542                    release it. */
543
544                 if (is_journal_aborted(journal)) {
545                         clear_buffer_jbddirty(jh2bh(jh));
546                         JBUFFER_TRACE(jh, "journal is aborting: refile");
547                         jbd2_buffer_abort_trigger(jh,
548                                                   jh->b_frozen_data ?
549                                                   jh->b_frozen_triggers :
550                                                   jh->b_triggers);
551                         jbd2_journal_refile_buffer(journal, jh);
552                         /* If that was the last one, we need to clean up
553                          * any descriptor buffers which may have been
554                          * already allocated, even if we are now
555                          * aborting. */
556                         if (!commit_transaction->t_buffers)
557                                 goto start_journal_io;
558                         continue;
559                 }
560
561                 /* Make sure we have a descriptor block in which to
562                    record the metadata buffer. */
563
564                 if (!descriptor) {
565                         struct buffer_head *bh;
566
567                         J_ASSERT (bufs == 0);
568
569                         jbd_debug(4, "JBD: get descriptor\n");
570
571                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
572                         if (!descriptor) {
573                                 jbd2_journal_abort(journal, -EIO);
574                                 continue;
575                         }
576
577                         bh = jh2bh(descriptor);
578                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
579                                 (unsigned long long)bh->b_blocknr, bh->b_data);
580                         header = (journal_header_t *)&bh->b_data[0];
581                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
582                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
583                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
584
585                         tagp = &bh->b_data[sizeof(journal_header_t)];
586                         space_left = bh->b_size - sizeof(journal_header_t);
587                         first_tag = 1;
588                         set_buffer_jwrite(bh);
589                         set_buffer_dirty(bh);
590                         wbuf[bufs++] = bh;
591
592                         /* Record it so that we can wait for IO
593                            completion later */
594                         BUFFER_TRACE(bh, "ph3: file as descriptor");
595                         jbd2_journal_file_buffer(descriptor, commit_transaction,
596                                         BJ_LogCtl);
597                 }
598
599                 /* Where is the buffer to be written? */
600
601                 err = jbd2_journal_next_log_block(journal, &blocknr);
602                 /* If the block mapping failed, just abandon the buffer
603                    and repeat this loop: we'll fall into the
604                    refile-on-abort condition above. */
605                 if (err) {
606                         jbd2_journal_abort(journal, err);
607                         continue;
608                 }
609
610                 /*
611                  * start_this_handle() uses t_outstanding_credits to determine
612                  * the free space in the log, but this counter is changed
613                  * by jbd2_journal_next_log_block() also.
614                  */
615                 atomic_dec(&commit_transaction->t_outstanding_credits);
616
617                 /* Bump b_count to prevent truncate from stumbling over
618                    the shadowed buffer!  @@@ This can go if we ever get
619                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
620                 atomic_inc(&jh2bh(jh)->b_count);
621
622                 /* Make a temporary IO buffer with which to write it out
623                    (this will requeue both the metadata buffer and the
624                    temporary IO buffer). new_bh goes on BJ_IO*/
625
626                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
627                 /*
628                  * akpm: jbd2_journal_write_metadata_buffer() sets
629                  * new_bh->b_transaction to commit_transaction.
630                  * We need to clean this up before we release new_bh
631                  * (which is of type BJ_IO)
632                  */
633                 JBUFFER_TRACE(jh, "ph3: write metadata");
634                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
635                                                       jh, &new_jh, blocknr);
636                 if (flags < 0) {
637                         jbd2_journal_abort(journal, flags);
638                         continue;
639                 }
640                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
641                 wbuf[bufs++] = jh2bh(new_jh);
642
643                 /* Record the new block's tag in the current descriptor
644                    buffer */
645
646                 tag_flag = 0;
647                 if (flags & 1)
648                         tag_flag |= JBD2_FLAG_ESCAPE;
649                 if (!first_tag)
650                         tag_flag |= JBD2_FLAG_SAME_UUID;
651
652                 tag = (journal_block_tag_t *) tagp;
653                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
654                 tag->t_flags = cpu_to_be32(tag_flag);
655                 tagp += tag_bytes;
656                 space_left -= tag_bytes;
657
658                 if (first_tag) {
659                         memcpy (tagp, journal->j_uuid, 16);
660                         tagp += 16;
661                         space_left -= 16;
662                         first_tag = 0;
663                 }
664
665                 /* If there's no more to do, or if the descriptor is full,
666                    let the IO rip! */
667
668                 if (bufs == journal->j_wbufsize ||
669                     commit_transaction->t_buffers == NULL ||
670                     space_left < tag_bytes + 16) {
671
672                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
673
674                         /* Write an end-of-descriptor marker before
675                            submitting the IOs.  "tag" still points to
676                            the last tag we set up. */
677
678                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
679
680 start_journal_io:
681                         for (i = 0; i < bufs; i++) {
682                                 struct buffer_head *bh = wbuf[i];
683                                 /*
684                                  * Compute checksum.
685                                  */
686                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
687                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
688                                         crc32_sum =
689                                             jbd2_checksum_data(crc32_sum, bh);
690                                 }
691
692                                 lock_buffer(bh);
693                                 clear_buffer_dirty(bh);
694                                 set_buffer_uptodate(bh);
695                                 bh->b_end_io = journal_end_buffer_io_sync;
696                                 submit_bh(write_op, bh);
697                         }
698                         cond_resched();
699                         stats.run.rs_blocks_logged += bufs;
700
701                         /* Force a new descriptor to be generated next
702                            time round the loop. */
703                         descriptor = NULL;
704                         bufs = 0;
705                 }
706         }
707
708         /* 
709          * If the journal is not located on the file system device,
710          * then we must flush the file system device before we issue
711          * the commit record
712          */
713         if (commit_transaction->t_flushed_data_blocks &&
714             (journal->j_fs_dev != journal->j_dev) &&
715             (journal->j_flags & JBD2_BARRIER))
716                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
717                         BLKDEV_IFL_WAIT);
718
719         /* Done it all: now write the commit record asynchronously. */
720         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
721                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
722                 err = journal_submit_commit_record(journal, commit_transaction,
723                                                  &cbh, crc32_sum);
724                 if (err)
725                         __jbd2_journal_abort_hard(journal);
726                 if (journal->j_flags & JBD2_BARRIER)
727                         blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
728                                 BLKDEV_IFL_WAIT);
729         }
730
731         err = journal_finish_inode_data_buffers(journal, commit_transaction);
732         if (err) {
733                 printk(KERN_WARNING
734                         "JBD2: Detected IO errors while flushing file data "
735                        "on %s\n", journal->j_devname);
736                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
737                         jbd2_journal_abort(journal, err);
738                 err = 0;
739         }
740
741         /* Lo and behold: we have just managed to send a transaction to
742            the log.  Before we can commit it, wait for the IO so far to
743            complete.  Control buffers being written are on the
744            transaction's t_log_list queue, and metadata buffers are on
745            the t_iobuf_list queue.
746
747            Wait for the buffers in reverse order.  That way we are
748            less likely to be woken up until all IOs have completed, and
749            so we incur less scheduling load.
750         */
751
752         jbd_debug(3, "JBD: commit phase 3\n");
753
754         /*
755          * akpm: these are BJ_IO, and j_list_lock is not needed.
756          * See __journal_try_to_free_buffer.
757          */
758 wait_for_iobuf:
759         while (commit_transaction->t_iobuf_list != NULL) {
760                 struct buffer_head *bh;
761
762                 jh = commit_transaction->t_iobuf_list->b_tprev;
763                 bh = jh2bh(jh);
764                 if (buffer_locked(bh)) {
765                         wait_on_buffer(bh);
766                         goto wait_for_iobuf;
767                 }
768                 if (cond_resched())
769                         goto wait_for_iobuf;
770
771                 if (unlikely(!buffer_uptodate(bh)))
772                         err = -EIO;
773
774                 clear_buffer_jwrite(bh);
775
776                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
777                 jbd2_journal_unfile_buffer(journal, jh);
778
779                 /*
780                  * ->t_iobuf_list should contain only dummy buffer_heads
781                  * which were created by jbd2_journal_write_metadata_buffer().
782                  */
783                 BUFFER_TRACE(bh, "dumping temporary bh");
784                 jbd2_journal_put_journal_head(jh);
785                 __brelse(bh);
786                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
787                 free_buffer_head(bh);
788
789                 /* We also have to unlock and free the corresponding
790                    shadowed buffer */
791                 jh = commit_transaction->t_shadow_list->b_tprev;
792                 bh = jh2bh(jh);
793                 clear_bit(BH_JWrite, &bh->b_state);
794                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
795
796                 /* The metadata is now released for reuse, but we need
797                    to remember it against this transaction so that when
798                    we finally commit, we can do any checkpointing
799                    required. */
800                 JBUFFER_TRACE(jh, "file as BJ_Forget");
801                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
802                 /* Wake up any transactions which were waiting for this
803                    IO to complete */
804                 wake_up_bit(&bh->b_state, BH_Unshadow);
805                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
806                 __brelse(bh);
807         }
808
809         J_ASSERT (commit_transaction->t_shadow_list == NULL);
810
811         jbd_debug(3, "JBD: commit phase 4\n");
812
813         /* Here we wait for the revoke record and descriptor record buffers */
814  wait_for_ctlbuf:
815         while (commit_transaction->t_log_list != NULL) {
816                 struct buffer_head *bh;
817
818                 jh = commit_transaction->t_log_list->b_tprev;
819                 bh = jh2bh(jh);
820                 if (buffer_locked(bh)) {
821                         wait_on_buffer(bh);
822                         goto wait_for_ctlbuf;
823                 }
824                 if (cond_resched())
825                         goto wait_for_ctlbuf;
826
827                 if (unlikely(!buffer_uptodate(bh)))
828                         err = -EIO;
829
830                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
831                 clear_buffer_jwrite(bh);
832                 jbd2_journal_unfile_buffer(journal, jh);
833                 jbd2_journal_put_journal_head(jh);
834                 __brelse(bh);           /* One for getblk */
835                 /* AKPM: bforget here */
836         }
837
838         if (err)
839                 jbd2_journal_abort(journal, err);
840
841         jbd_debug(3, "JBD: commit phase 5\n");
842
843         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
844                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
845                 err = journal_submit_commit_record(journal, commit_transaction,
846                                                 &cbh, crc32_sum);
847                 if (err)
848                         __jbd2_journal_abort_hard(journal);
849         }
850         if (!err && !is_journal_aborted(journal))
851                 err = journal_wait_on_commit_record(journal, cbh);
852
853         if (err)
854                 jbd2_journal_abort(journal, err);
855
856         /* End of a transaction!  Finally, we can do checkpoint
857            processing: any buffers committed as a result of this
858            transaction can be removed from any checkpoint list it was on
859            before. */
860
861         jbd_debug(3, "JBD: commit phase 6\n");
862
863         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
864         J_ASSERT(commit_transaction->t_buffers == NULL);
865         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
866         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
867         J_ASSERT(commit_transaction->t_shadow_list == NULL);
868         J_ASSERT(commit_transaction->t_log_list == NULL);
869
870 restart_loop:
871         /*
872          * As there are other places (journal_unmap_buffer()) adding buffers
873          * to this list we have to be careful and hold the j_list_lock.
874          */
875         spin_lock(&journal->j_list_lock);
876         while (commit_transaction->t_forget) {
877                 transaction_t *cp_transaction;
878                 struct buffer_head *bh;
879
880                 jh = commit_transaction->t_forget;
881                 spin_unlock(&journal->j_list_lock);
882                 bh = jh2bh(jh);
883                 jbd_lock_bh_state(bh);
884                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
885
886                 /*
887                  * If there is undo-protected committed data against
888                  * this buffer, then we can remove it now.  If it is a
889                  * buffer needing such protection, the old frozen_data
890                  * field now points to a committed version of the
891                  * buffer, so rotate that field to the new committed
892                  * data.
893                  *
894                  * Otherwise, we can just throw away the frozen data now.
895                  *
896                  * We also know that the frozen data has already fired
897                  * its triggers if they exist, so we can clear that too.
898                  */
899                 if (jh->b_committed_data) {
900                         jbd2_free(jh->b_committed_data, bh->b_size);
901                         jh->b_committed_data = NULL;
902                         if (jh->b_frozen_data) {
903                                 jh->b_committed_data = jh->b_frozen_data;
904                                 jh->b_frozen_data = NULL;
905                                 jh->b_frozen_triggers = NULL;
906                         }
907                 } else if (jh->b_frozen_data) {
908                         jbd2_free(jh->b_frozen_data, bh->b_size);
909                         jh->b_frozen_data = NULL;
910                         jh->b_frozen_triggers = NULL;
911                 }
912
913                 spin_lock(&journal->j_list_lock);
914                 cp_transaction = jh->b_cp_transaction;
915                 if (cp_transaction) {
916                         JBUFFER_TRACE(jh, "remove from old cp transaction");
917                         cp_transaction->t_chp_stats.cs_dropped++;
918                         __jbd2_journal_remove_checkpoint(jh);
919                 }
920
921                 /* Only re-checkpoint the buffer_head if it is marked
922                  * dirty.  If the buffer was added to the BJ_Forget list
923                  * by jbd2_journal_forget, it may no longer be dirty and
924                  * there's no point in keeping a checkpoint record for
925                  * it. */
926
927                 /* A buffer which has been freed while still being
928                  * journaled by a previous transaction may end up still
929                  * being dirty here, but we want to avoid writing back
930                  * that buffer in the future after the "add to orphan"
931                  * operation has been committed.  That's not only a performance
932                  * gain, it also stops aliasing problems if the buffer is
933                  * left behind for writeback and gets reallocated for another
934                  * use in a different page. */
935                 if (buffer_freed(bh) && !jh->b_next_transaction) {
936                         clear_buffer_freed(bh);
937                         clear_buffer_jbddirty(bh);
938                 }
939
940                 if (buffer_jbddirty(bh)) {
941                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
942                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
943                         if (is_journal_aborted(journal))
944                                 clear_buffer_jbddirty(bh);
945                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
946                         __jbd2_journal_refile_buffer(jh);
947                         jbd_unlock_bh_state(bh);
948                 } else {
949                         J_ASSERT_BH(bh, !buffer_dirty(bh));
950                         /* The buffer on BJ_Forget list and not jbddirty means
951                          * it has been freed by this transaction and hence it
952                          * could not have been reallocated until this
953                          * transaction has committed. *BUT* it could be
954                          * reallocated once we have written all the data to
955                          * disk and before we process the buffer on BJ_Forget
956                          * list. */
957                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
958                         __jbd2_journal_refile_buffer(jh);
959                         if (!jh->b_transaction) {
960                                 jbd_unlock_bh_state(bh);
961                                  /* needs a brelse */
962                                 jbd2_journal_remove_journal_head(bh);
963                                 release_buffer_page(bh);
964                         } else
965                                 jbd_unlock_bh_state(bh);
966                 }
967                 cond_resched_lock(&journal->j_list_lock);
968         }
969         spin_unlock(&journal->j_list_lock);
970         /*
971          * This is a bit sleazy.  We use j_list_lock to protect transition
972          * of a transaction into T_FINISHED state and calling
973          * __jbd2_journal_drop_transaction(). Otherwise we could race with
974          * other checkpointing code processing the transaction...
975          */
976         write_lock(&journal->j_state_lock);
977         spin_lock(&journal->j_list_lock);
978         /*
979          * Now recheck whether any buffers got attached to the transaction's
980          * forget list while the lock was dropped...
981          */
982         if (commit_transaction->t_forget) {
983                 spin_unlock(&journal->j_list_lock);
984                 write_unlock(&journal->j_state_lock);
985                 goto restart_loop;
986         }
987
988         /* Done with this transaction! */
989
990         jbd_debug(3, "JBD: commit phase 7\n");
991
992         J_ASSERT(commit_transaction->t_state == T_COMMIT);
993
994         commit_transaction->t_start = jiffies;
995         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
996                                               commit_transaction->t_start);
997
998         /*
999          * File the transaction statistics
1000          */
1001         stats.ts_tid = commit_transaction->t_tid;
1002         stats.run.rs_handle_count =
1003                 atomic_read(&commit_transaction->t_handle_count);
1004         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1005                              commit_transaction->t_tid, &stats.run);
1006
1007         /*
1008          * Calculate overall stats
1009          */
1010         spin_lock(&journal->j_history_lock);
1011         journal->j_stats.ts_tid++;
1012         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1013         journal->j_stats.run.rs_running += stats.run.rs_running;
1014         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1015         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1016         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1017         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1018         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1019         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1020         spin_unlock(&journal->j_history_lock);
1021
1022         commit_transaction->t_state = T_FINISHED;
1023         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1024         journal->j_commit_sequence = commit_transaction->t_tid;
1025         journal->j_committing_transaction = NULL;
1026         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1027
1028         /*
1029          * weight the previous average three times as heavily as the latest
1030          * commit time so we don't react too strongly to vast changes in it
1031          */
1032         if (likely(journal->j_average_commit_time))
1033                 journal->j_average_commit_time = (commit_time +
1034                                 journal->j_average_commit_time*3) / 4;
1035         else
1036                 journal->j_average_commit_time = commit_time;
1037         write_unlock(&journal->j_state_lock);
1038
1039         if (commit_transaction->t_checkpoint_list == NULL &&
1040             commit_transaction->t_checkpoint_io_list == NULL) {
1041                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1042                 to_free = 1;
1043         } else {
1044                 if (journal->j_checkpoint_transactions == NULL) {
1045                         journal->j_checkpoint_transactions = commit_transaction;
1046                         commit_transaction->t_cpnext = commit_transaction;
1047                         commit_transaction->t_cpprev = commit_transaction;
1048                 } else {
1049                         commit_transaction->t_cpnext =
1050                                 journal->j_checkpoint_transactions;
1051                         commit_transaction->t_cpprev =
1052                                 commit_transaction->t_cpnext->t_cpprev;
1053                         commit_transaction->t_cpnext->t_cpprev =
1054                                 commit_transaction;
1055                         commit_transaction->t_cpprev->t_cpnext =
1056                                 commit_transaction;
1057                 }
1058         }
1059         spin_unlock(&journal->j_list_lock);
1060
1061         if (journal->j_commit_callback)
1062                 journal->j_commit_callback(journal, commit_transaction);
1063
1064         trace_jbd2_end_commit(journal, commit_transaction);
1065         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1066                   journal->j_commit_sequence, journal->j_tail_sequence);
1067         if (to_free)
1068                 kfree(commit_transaction);
1069
1070         wake_up(&journal->j_wait_done_commit);
1071 }