57d4ca7fd5558667779a6ef80cb6f30c4174784c
[pandora-kernel.git] / fs / btrfs / tree-log.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/blkdev.h>
22 #include <linux/list_sort.h>
23 #include "ctree.h"
24 #include "transaction.h"
25 #include "disk-io.h"
26 #include "locking.h"
27 #include "print-tree.h"
28 #include "backref.h"
29 #include "tree-log.h"
30 #include "hash.h"
31
32 /* magic values for the inode_only field in btrfs_log_inode:
33  *
34  * LOG_INODE_ALL means to log everything
35  * LOG_INODE_EXISTS means to log just enough to recreate the inode
36  * during log replay
37  */
38 #define LOG_INODE_ALL 0
39 #define LOG_INODE_EXISTS 1
40
41 /*
42  * directory trouble cases
43  *
44  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
45  * log, we must force a full commit before doing an fsync of the directory
46  * where the unlink was done.
47  * ---> record transid of last unlink/rename per directory
48  *
49  * mkdir foo/some_dir
50  * normal commit
51  * rename foo/some_dir foo2/some_dir
52  * mkdir foo/some_dir
53  * fsync foo/some_dir/some_file
54  *
55  * The fsync above will unlink the original some_dir without recording
56  * it in its new location (foo2).  After a crash, some_dir will be gone
57  * unless the fsync of some_file forces a full commit
58  *
59  * 2) we must log any new names for any file or dir that is in the fsync
60  * log. ---> check inode while renaming/linking.
61  *
62  * 2a) we must log any new names for any file or dir during rename
63  * when the directory they are being removed from was logged.
64  * ---> check inode and old parent dir during rename
65  *
66  *  2a is actually the more important variant.  With the extra logging
67  *  a crash might unlink the old name without recreating the new one
68  *
69  * 3) after a crash, we must go through any directories with a link count
70  * of zero and redo the rm -rf
71  *
72  * mkdir f1/foo
73  * normal commit
74  * rm -rf f1/foo
75  * fsync(f1)
76  *
77  * The directory f1 was fully removed from the FS, but fsync was never
78  * called on f1, only its parent dir.  After a crash the rm -rf must
79  * be replayed.  This must be able to recurse down the entire
80  * directory tree.  The inode link count fixup code takes care of the
81  * ugly details.
82  */
83
84 /*
85  * stages for the tree walking.  The first
86  * stage (0) is to only pin down the blocks we find
87  * the second stage (1) is to make sure that all the inodes
88  * we find in the log are created in the subvolume.
89  *
90  * The last stage is to deal with directories and links and extents
91  * and all the other fun semantics
92  */
93 #define LOG_WALK_PIN_ONLY 0
94 #define LOG_WALK_REPLAY_INODES 1
95 #define LOG_WALK_REPLAY_DIR_INDEX 2
96 #define LOG_WALK_REPLAY_ALL 3
97
98 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
99                              struct btrfs_root *root, struct inode *inode,
100                              int inode_only);
101 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
102                              struct btrfs_root *root,
103                              struct btrfs_path *path, u64 objectid);
104 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
105                                        struct btrfs_root *root,
106                                        struct btrfs_root *log,
107                                        struct btrfs_path *path,
108                                        u64 dirid, int del_all);
109
110 /*
111  * tree logging is a special write ahead log used to make sure that
112  * fsyncs and O_SYNCs can happen without doing full tree commits.
113  *
114  * Full tree commits are expensive because they require commonly
115  * modified blocks to be recowed, creating many dirty pages in the
116  * extent tree an 4x-6x higher write load than ext3.
117  *
118  * Instead of doing a tree commit on every fsync, we use the
119  * key ranges and transaction ids to find items for a given file or directory
120  * that have changed in this transaction.  Those items are copied into
121  * a special tree (one per subvolume root), that tree is written to disk
122  * and then the fsync is considered complete.
123  *
124  * After a crash, items are copied out of the log-tree back into the
125  * subvolume tree.  Any file data extents found are recorded in the extent
126  * allocation tree, and the log-tree freed.
127  *
128  * The log tree is read three times, once to pin down all the extents it is
129  * using in ram and once, once to create all the inodes logged in the tree
130  * and once to do all the other items.
131  */
132
133 /*
134  * start a sub transaction and setup the log tree
135  * this increments the log tree writer count to make the people
136  * syncing the tree wait for us to finish
137  */
138 static int start_log_trans(struct btrfs_trans_handle *trans,
139                            struct btrfs_root *root,
140                            struct btrfs_log_ctx *ctx)
141 {
142         int index;
143         int ret;
144
145         mutex_lock(&root->log_mutex);
146         if (root->log_root) {
147                 if (!root->log_start_pid) {
148                         root->log_start_pid = current->pid;
149                         root->log_multiple_pids = false;
150                 } else if (root->log_start_pid != current->pid) {
151                         root->log_multiple_pids = true;
152                 }
153
154                 atomic_inc(&root->log_batch);
155                 atomic_inc(&root->log_writers);
156                 if (ctx) {
157                         index = root->log_transid % 2;
158                         list_add_tail(&ctx->list, &root->log_ctxs[index]);
159                         ctx->log_transid = root->log_transid;
160                 }
161                 mutex_unlock(&root->log_mutex);
162                 return 0;
163         }
164
165         ret = 0;
166         mutex_lock(&root->fs_info->tree_log_mutex);
167         if (!root->fs_info->log_root_tree)
168                 ret = btrfs_init_log_root_tree(trans, root->fs_info);
169         mutex_unlock(&root->fs_info->tree_log_mutex);
170         if (ret)
171                 goto out;
172
173         if (!root->log_root) {
174                 ret = btrfs_add_log_tree(trans, root);
175                 if (ret)
176                         goto out;
177         }
178         root->log_multiple_pids = false;
179         root->log_start_pid = current->pid;
180         atomic_inc(&root->log_batch);
181         atomic_inc(&root->log_writers);
182         if (ctx) {
183                 index = root->log_transid % 2;
184                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
185                 ctx->log_transid = root->log_transid;
186         }
187 out:
188         mutex_unlock(&root->log_mutex);
189         return ret;
190 }
191
192 /*
193  * returns 0 if there was a log transaction running and we were able
194  * to join, or returns -ENOENT if there were not transactions
195  * in progress
196  */
197 static int join_running_log_trans(struct btrfs_root *root)
198 {
199         int ret = -ENOENT;
200
201         smp_mb();
202         if (!root->log_root)
203                 return -ENOENT;
204
205         mutex_lock(&root->log_mutex);
206         if (root->log_root) {
207                 ret = 0;
208                 atomic_inc(&root->log_writers);
209         }
210         mutex_unlock(&root->log_mutex);
211         return ret;
212 }
213
214 /*
215  * This either makes the current running log transaction wait
216  * until you call btrfs_end_log_trans() or it makes any future
217  * log transactions wait until you call btrfs_end_log_trans()
218  */
219 int btrfs_pin_log_trans(struct btrfs_root *root)
220 {
221         int ret = -ENOENT;
222
223         mutex_lock(&root->log_mutex);
224         atomic_inc(&root->log_writers);
225         mutex_unlock(&root->log_mutex);
226         return ret;
227 }
228
229 /*
230  * indicate we're done making changes to the log tree
231  * and wake up anyone waiting to do a sync
232  */
233 void btrfs_end_log_trans(struct btrfs_root *root)
234 {
235         if (atomic_dec_and_test(&root->log_writers)) {
236                 smp_mb();
237                 if (waitqueue_active(&root->log_writer_wait))
238                         wake_up(&root->log_writer_wait);
239         }
240 }
241
242
243 /*
244  * the walk control struct is used to pass state down the chain when
245  * processing the log tree.  The stage field tells us which part
246  * of the log tree processing we are currently doing.  The others
247  * are state fields used for that specific part
248  */
249 struct walk_control {
250         /* should we free the extent on disk when done?  This is used
251          * at transaction commit time while freeing a log tree
252          */
253         int free;
254
255         /* should we write out the extent buffer?  This is used
256          * while flushing the log tree to disk during a sync
257          */
258         int write;
259
260         /* should we wait for the extent buffer io to finish?  Also used
261          * while flushing the log tree to disk for a sync
262          */
263         int wait;
264
265         /* pin only walk, we record which extents on disk belong to the
266          * log trees
267          */
268         int pin;
269
270         /* what stage of the replay code we're currently in */
271         int stage;
272
273         /* the root we are currently replaying */
274         struct btrfs_root *replay_dest;
275
276         /* the trans handle for the current replay */
277         struct btrfs_trans_handle *trans;
278
279         /* the function that gets used to process blocks we find in the
280          * tree.  Note the extent_buffer might not be up to date when it is
281          * passed in, and it must be checked or read if you need the data
282          * inside it
283          */
284         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
285                             struct walk_control *wc, u64 gen);
286 };
287
288 /*
289  * process_func used to pin down extents, write them or wait on them
290  */
291 static int process_one_buffer(struct btrfs_root *log,
292                               struct extent_buffer *eb,
293                               struct walk_control *wc, u64 gen)
294 {
295         int ret = 0;
296
297         /*
298          * If this fs is mixed then we need to be able to process the leaves to
299          * pin down any logged extents, so we have to read the block.
300          */
301         if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
302                 ret = btrfs_read_buffer(eb, gen);
303                 if (ret)
304                         return ret;
305         }
306
307         if (wc->pin)
308                 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
309                                                       eb->start, eb->len);
310
311         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
312                 if (wc->pin && btrfs_header_level(eb) == 0)
313                         ret = btrfs_exclude_logged_extents(log, eb);
314                 if (wc->write)
315                         btrfs_write_tree_block(eb);
316                 if (wc->wait)
317                         btrfs_wait_tree_block_writeback(eb);
318         }
319         return ret;
320 }
321
322 /*
323  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
324  * to the src data we are copying out.
325  *
326  * root is the tree we are copying into, and path is a scratch
327  * path for use in this function (it should be released on entry and
328  * will be released on exit).
329  *
330  * If the key is already in the destination tree the existing item is
331  * overwritten.  If the existing item isn't big enough, it is extended.
332  * If it is too large, it is truncated.
333  *
334  * If the key isn't in the destination yet, a new item is inserted.
335  */
336 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
337                                    struct btrfs_root *root,
338                                    struct btrfs_path *path,
339                                    struct extent_buffer *eb, int slot,
340                                    struct btrfs_key *key)
341 {
342         int ret;
343         u32 item_size;
344         u64 saved_i_size = 0;
345         int save_old_i_size = 0;
346         unsigned long src_ptr;
347         unsigned long dst_ptr;
348         int overwrite_root = 0;
349         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
350
351         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
352                 overwrite_root = 1;
353
354         item_size = btrfs_item_size_nr(eb, slot);
355         src_ptr = btrfs_item_ptr_offset(eb, slot);
356
357         /* look for the key in the destination tree */
358         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
359         if (ret < 0)
360                 return ret;
361
362         if (ret == 0) {
363                 char *src_copy;
364                 char *dst_copy;
365                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
366                                                   path->slots[0]);
367                 if (dst_size != item_size)
368                         goto insert;
369
370                 if (item_size == 0) {
371                         btrfs_release_path(path);
372                         return 0;
373                 }
374                 dst_copy = kmalloc(item_size, GFP_NOFS);
375                 src_copy = kmalloc(item_size, GFP_NOFS);
376                 if (!dst_copy || !src_copy) {
377                         btrfs_release_path(path);
378                         kfree(dst_copy);
379                         kfree(src_copy);
380                         return -ENOMEM;
381                 }
382
383                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
384
385                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
386                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
387                                    item_size);
388                 ret = memcmp(dst_copy, src_copy, item_size);
389
390                 kfree(dst_copy);
391                 kfree(src_copy);
392                 /*
393                  * they have the same contents, just return, this saves
394                  * us from cowing blocks in the destination tree and doing
395                  * extra writes that may not have been done by a previous
396                  * sync
397                  */
398                 if (ret == 0) {
399                         btrfs_release_path(path);
400                         return 0;
401                 }
402
403                 /*
404                  * We need to load the old nbytes into the inode so when we
405                  * replay the extents we've logged we get the right nbytes.
406                  */
407                 if (inode_item) {
408                         struct btrfs_inode_item *item;
409                         u64 nbytes;
410                         u32 mode;
411
412                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
413                                               struct btrfs_inode_item);
414                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
415                         item = btrfs_item_ptr(eb, slot,
416                                               struct btrfs_inode_item);
417                         btrfs_set_inode_nbytes(eb, item, nbytes);
418
419                         /*
420                          * If this is a directory we need to reset the i_size to
421                          * 0 so that we can set it up properly when replaying
422                          * the rest of the items in this log.
423                          */
424                         mode = btrfs_inode_mode(eb, item);
425                         if (S_ISDIR(mode))
426                                 btrfs_set_inode_size(eb, item, 0);
427                 }
428         } else if (inode_item) {
429                 struct btrfs_inode_item *item;
430                 u32 mode;
431
432                 /*
433                  * New inode, set nbytes to 0 so that the nbytes comes out
434                  * properly when we replay the extents.
435                  */
436                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
437                 btrfs_set_inode_nbytes(eb, item, 0);
438
439                 /*
440                  * If this is a directory we need to reset the i_size to 0 so
441                  * that we can set it up properly when replaying the rest of
442                  * the items in this log.
443                  */
444                 mode = btrfs_inode_mode(eb, item);
445                 if (S_ISDIR(mode))
446                         btrfs_set_inode_size(eb, item, 0);
447         }
448 insert:
449         btrfs_release_path(path);
450         /* try to insert the key into the destination tree */
451         ret = btrfs_insert_empty_item(trans, root, path,
452                                       key, item_size);
453
454         /* make sure any existing item is the correct size */
455         if (ret == -EEXIST) {
456                 u32 found_size;
457                 found_size = btrfs_item_size_nr(path->nodes[0],
458                                                 path->slots[0]);
459                 if (found_size > item_size)
460                         btrfs_truncate_item(root, path, item_size, 1);
461                 else if (found_size < item_size)
462                         btrfs_extend_item(root, path,
463                                           item_size - found_size);
464         } else if (ret) {
465                 return ret;
466         }
467         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
468                                         path->slots[0]);
469
470         /* don't overwrite an existing inode if the generation number
471          * was logged as zero.  This is done when the tree logging code
472          * is just logging an inode to make sure it exists after recovery.
473          *
474          * Also, don't overwrite i_size on directories during replay.
475          * log replay inserts and removes directory items based on the
476          * state of the tree found in the subvolume, and i_size is modified
477          * as it goes
478          */
479         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
480                 struct btrfs_inode_item *src_item;
481                 struct btrfs_inode_item *dst_item;
482
483                 src_item = (struct btrfs_inode_item *)src_ptr;
484                 dst_item = (struct btrfs_inode_item *)dst_ptr;
485
486                 if (btrfs_inode_generation(eb, src_item) == 0)
487                         goto no_copy;
488
489                 if (overwrite_root &&
490                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
491                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
492                         save_old_i_size = 1;
493                         saved_i_size = btrfs_inode_size(path->nodes[0],
494                                                         dst_item);
495                 }
496         }
497
498         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
499                            src_ptr, item_size);
500
501         if (save_old_i_size) {
502                 struct btrfs_inode_item *dst_item;
503                 dst_item = (struct btrfs_inode_item *)dst_ptr;
504                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
505         }
506
507         /* make sure the generation is filled in */
508         if (key->type == BTRFS_INODE_ITEM_KEY) {
509                 struct btrfs_inode_item *dst_item;
510                 dst_item = (struct btrfs_inode_item *)dst_ptr;
511                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
512                         btrfs_set_inode_generation(path->nodes[0], dst_item,
513                                                    trans->transid);
514                 }
515         }
516 no_copy:
517         btrfs_mark_buffer_dirty(path->nodes[0]);
518         btrfs_release_path(path);
519         return 0;
520 }
521
522 /*
523  * simple helper to read an inode off the disk from a given root
524  * This can only be called for subvolume roots and not for the log
525  */
526 static noinline struct inode *read_one_inode(struct btrfs_root *root,
527                                              u64 objectid)
528 {
529         struct btrfs_key key;
530         struct inode *inode;
531
532         key.objectid = objectid;
533         key.type = BTRFS_INODE_ITEM_KEY;
534         key.offset = 0;
535         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
536         if (IS_ERR(inode)) {
537                 inode = NULL;
538         } else if (is_bad_inode(inode)) {
539                 iput(inode);
540                 inode = NULL;
541         }
542         return inode;
543 }
544
545 /* replays a single extent in 'eb' at 'slot' with 'key' into the
546  * subvolume 'root'.  path is released on entry and should be released
547  * on exit.
548  *
549  * extents in the log tree have not been allocated out of the extent
550  * tree yet.  So, this completes the allocation, taking a reference
551  * as required if the extent already exists or creating a new extent
552  * if it isn't in the extent allocation tree yet.
553  *
554  * The extent is inserted into the file, dropping any existing extents
555  * from the file that overlap the new one.
556  */
557 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
558                                       struct btrfs_root *root,
559                                       struct btrfs_path *path,
560                                       struct extent_buffer *eb, int slot,
561                                       struct btrfs_key *key)
562 {
563         int found_type;
564         u64 extent_end;
565         u64 start = key->offset;
566         u64 nbytes = 0;
567         struct btrfs_file_extent_item *item;
568         struct inode *inode = NULL;
569         unsigned long size;
570         int ret = 0;
571
572         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
573         found_type = btrfs_file_extent_type(eb, item);
574
575         if (found_type == BTRFS_FILE_EXTENT_REG ||
576             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
577                 nbytes = btrfs_file_extent_num_bytes(eb, item);
578                 extent_end = start + nbytes;
579
580                 /*
581                  * We don't add to the inodes nbytes if we are prealloc or a
582                  * hole.
583                  */
584                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
585                         nbytes = 0;
586         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
587                 size = btrfs_file_extent_inline_len(eb, slot, item);
588                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
589                 extent_end = ALIGN(start + size, root->sectorsize);
590         } else {
591                 ret = 0;
592                 goto out;
593         }
594
595         inode = read_one_inode(root, key->objectid);
596         if (!inode) {
597                 ret = -EIO;
598                 goto out;
599         }
600
601         /*
602          * first check to see if we already have this extent in the
603          * file.  This must be done before the btrfs_drop_extents run
604          * so we don't try to drop this extent.
605          */
606         ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
607                                        start, 0);
608
609         if (ret == 0 &&
610             (found_type == BTRFS_FILE_EXTENT_REG ||
611              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
612                 struct btrfs_file_extent_item cmp1;
613                 struct btrfs_file_extent_item cmp2;
614                 struct btrfs_file_extent_item *existing;
615                 struct extent_buffer *leaf;
616
617                 leaf = path->nodes[0];
618                 existing = btrfs_item_ptr(leaf, path->slots[0],
619                                           struct btrfs_file_extent_item);
620
621                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
622                                    sizeof(cmp1));
623                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
624                                    sizeof(cmp2));
625
626                 /*
627                  * we already have a pointer to this exact extent,
628                  * we don't have to do anything
629                  */
630                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
631                         btrfs_release_path(path);
632                         goto out;
633                 }
634         }
635         btrfs_release_path(path);
636
637         /* drop any overlapping extents */
638         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
639         if (ret)
640                 goto out;
641
642         if (found_type == BTRFS_FILE_EXTENT_REG ||
643             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
644                 u64 offset;
645                 unsigned long dest_offset;
646                 struct btrfs_key ins;
647
648                 ret = btrfs_insert_empty_item(trans, root, path, key,
649                                               sizeof(*item));
650                 if (ret)
651                         goto out;
652                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
653                                                     path->slots[0]);
654                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
655                                 (unsigned long)item,  sizeof(*item));
656
657                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
658                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
659                 ins.type = BTRFS_EXTENT_ITEM_KEY;
660                 offset = key->offset - btrfs_file_extent_offset(eb, item);
661
662                 if (ins.objectid > 0) {
663                         u64 csum_start;
664                         u64 csum_end;
665                         LIST_HEAD(ordered_sums);
666                         /*
667                          * is this extent already allocated in the extent
668                          * allocation tree?  If so, just add a reference
669                          */
670                         ret = btrfs_lookup_extent(root, ins.objectid,
671                                                 ins.offset);
672                         if (ret == 0) {
673                                 ret = btrfs_inc_extent_ref(trans, root,
674                                                 ins.objectid, ins.offset,
675                                                 0, root->root_key.objectid,
676                                                 key->objectid, offset, 0);
677                                 if (ret)
678                                         goto out;
679                         } else {
680                                 /*
681                                  * insert the extent pointer in the extent
682                                  * allocation tree
683                                  */
684                                 ret = btrfs_alloc_logged_file_extent(trans,
685                                                 root, root->root_key.objectid,
686                                                 key->objectid, offset, &ins);
687                                 if (ret)
688                                         goto out;
689                         }
690                         btrfs_release_path(path);
691
692                         if (btrfs_file_extent_compression(eb, item)) {
693                                 csum_start = ins.objectid;
694                                 csum_end = csum_start + ins.offset;
695                         } else {
696                                 csum_start = ins.objectid +
697                                         btrfs_file_extent_offset(eb, item);
698                                 csum_end = csum_start +
699                                         btrfs_file_extent_num_bytes(eb, item);
700                         }
701
702                         ret = btrfs_lookup_csums_range(root->log_root,
703                                                 csum_start, csum_end - 1,
704                                                 &ordered_sums, 0);
705                         if (ret)
706                                 goto out;
707                         while (!list_empty(&ordered_sums)) {
708                                 struct btrfs_ordered_sum *sums;
709                                 sums = list_entry(ordered_sums.next,
710                                                 struct btrfs_ordered_sum,
711                                                 list);
712                                 if (!ret)
713                                         ret = btrfs_csum_file_blocks(trans,
714                                                 root->fs_info->csum_root,
715                                                 sums);
716                                 list_del(&sums->list);
717                                 kfree(sums);
718                         }
719                         if (ret)
720                                 goto out;
721                 } else {
722                         btrfs_release_path(path);
723                 }
724         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
725                 /* inline extents are easy, we just overwrite them */
726                 ret = overwrite_item(trans, root, path, eb, slot, key);
727                 if (ret)
728                         goto out;
729         }
730
731         inode_add_bytes(inode, nbytes);
732         ret = btrfs_update_inode(trans, root, inode);
733 out:
734         if (inode)
735                 iput(inode);
736         return ret;
737 }
738
739 /*
740  * when cleaning up conflicts between the directory names in the
741  * subvolume, directory names in the log and directory names in the
742  * inode back references, we may have to unlink inodes from directories.
743  *
744  * This is a helper function to do the unlink of a specific directory
745  * item
746  */
747 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
748                                       struct btrfs_root *root,
749                                       struct btrfs_path *path,
750                                       struct inode *dir,
751                                       struct btrfs_dir_item *di)
752 {
753         struct inode *inode;
754         char *name;
755         int name_len;
756         struct extent_buffer *leaf;
757         struct btrfs_key location;
758         int ret;
759
760         leaf = path->nodes[0];
761
762         btrfs_dir_item_key_to_cpu(leaf, di, &location);
763         name_len = btrfs_dir_name_len(leaf, di);
764         name = kmalloc(name_len, GFP_NOFS);
765         if (!name)
766                 return -ENOMEM;
767
768         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
769         btrfs_release_path(path);
770
771         inode = read_one_inode(root, location.objectid);
772         if (!inode) {
773                 ret = -EIO;
774                 goto out;
775         }
776
777         ret = link_to_fixup_dir(trans, root, path, location.objectid);
778         if (ret)
779                 goto out;
780
781         ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
782         if (ret)
783                 goto out;
784         else
785                 ret = btrfs_run_delayed_items(trans, root);
786 out:
787         kfree(name);
788         iput(inode);
789         return ret;
790 }
791
792 /*
793  * helper function to see if a given name and sequence number found
794  * in an inode back reference are already in a directory and correctly
795  * point to this inode
796  */
797 static noinline int inode_in_dir(struct btrfs_root *root,
798                                  struct btrfs_path *path,
799                                  u64 dirid, u64 objectid, u64 index,
800                                  const char *name, int name_len)
801 {
802         struct btrfs_dir_item *di;
803         struct btrfs_key location;
804         int match = 0;
805
806         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
807                                          index, name, name_len, 0);
808         if (di && !IS_ERR(di)) {
809                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
810                 if (location.objectid != objectid)
811                         goto out;
812         } else
813                 goto out;
814         btrfs_release_path(path);
815
816         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
817         if (di && !IS_ERR(di)) {
818                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
819                 if (location.objectid != objectid)
820                         goto out;
821         } else
822                 goto out;
823         match = 1;
824 out:
825         btrfs_release_path(path);
826         return match;
827 }
828
829 /*
830  * helper function to check a log tree for a named back reference in
831  * an inode.  This is used to decide if a back reference that is
832  * found in the subvolume conflicts with what we find in the log.
833  *
834  * inode backreferences may have multiple refs in a single item,
835  * during replay we process one reference at a time, and we don't
836  * want to delete valid links to a file from the subvolume if that
837  * link is also in the log.
838  */
839 static noinline int backref_in_log(struct btrfs_root *log,
840                                    struct btrfs_key *key,
841                                    u64 ref_objectid,
842                                    char *name, int namelen)
843 {
844         struct btrfs_path *path;
845         struct btrfs_inode_ref *ref;
846         unsigned long ptr;
847         unsigned long ptr_end;
848         unsigned long name_ptr;
849         int found_name_len;
850         int item_size;
851         int ret;
852         int match = 0;
853
854         path = btrfs_alloc_path();
855         if (!path)
856                 return -ENOMEM;
857
858         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
859         if (ret != 0)
860                 goto out;
861
862         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
863
864         if (key->type == BTRFS_INODE_EXTREF_KEY) {
865                 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
866                                                    name, namelen, NULL))
867                         match = 1;
868
869                 goto out;
870         }
871
872         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
873         ptr_end = ptr + item_size;
874         while (ptr < ptr_end) {
875                 ref = (struct btrfs_inode_ref *)ptr;
876                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
877                 if (found_name_len == namelen) {
878                         name_ptr = (unsigned long)(ref + 1);
879                         ret = memcmp_extent_buffer(path->nodes[0], name,
880                                                    name_ptr, namelen);
881                         if (ret == 0) {
882                                 match = 1;
883                                 goto out;
884                         }
885                 }
886                 ptr = (unsigned long)(ref + 1) + found_name_len;
887         }
888 out:
889         btrfs_free_path(path);
890         return match;
891 }
892
893 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
894                                   struct btrfs_root *root,
895                                   struct btrfs_path *path,
896                                   struct btrfs_root *log_root,
897                                   struct inode *dir, struct inode *inode,
898                                   struct extent_buffer *eb,
899                                   u64 inode_objectid, u64 parent_objectid,
900                                   u64 ref_index, char *name, int namelen,
901                                   int *search_done)
902 {
903         int ret;
904         char *victim_name;
905         int victim_name_len;
906         struct extent_buffer *leaf;
907         struct btrfs_dir_item *di;
908         struct btrfs_key search_key;
909         struct btrfs_inode_extref *extref;
910
911 again:
912         /* Search old style refs */
913         search_key.objectid = inode_objectid;
914         search_key.type = BTRFS_INODE_REF_KEY;
915         search_key.offset = parent_objectid;
916         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
917         if (ret == 0) {
918                 struct btrfs_inode_ref *victim_ref;
919                 unsigned long ptr;
920                 unsigned long ptr_end;
921
922                 leaf = path->nodes[0];
923
924                 /* are we trying to overwrite a back ref for the root directory
925                  * if so, just jump out, we're done
926                  */
927                 if (search_key.objectid == search_key.offset)
928                         return 1;
929
930                 /* check all the names in this back reference to see
931                  * if they are in the log.  if so, we allow them to stay
932                  * otherwise they must be unlinked as a conflict
933                  */
934                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
935                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
936                 while (ptr < ptr_end) {
937                         victim_ref = (struct btrfs_inode_ref *)ptr;
938                         victim_name_len = btrfs_inode_ref_name_len(leaf,
939                                                                    victim_ref);
940                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
941                         if (!victim_name)
942                                 return -ENOMEM;
943
944                         read_extent_buffer(leaf, victim_name,
945                                            (unsigned long)(victim_ref + 1),
946                                            victim_name_len);
947
948                         if (!backref_in_log(log_root, &search_key,
949                                             parent_objectid,
950                                             victim_name,
951                                             victim_name_len)) {
952                                 inc_nlink(inode);
953                                 btrfs_release_path(path);
954
955                                 ret = btrfs_unlink_inode(trans, root, dir,
956                                                          inode, victim_name,
957                                                          victim_name_len);
958                                 kfree(victim_name);
959                                 if (ret)
960                                         return ret;
961                                 ret = btrfs_run_delayed_items(trans, root);
962                                 if (ret)
963                                         return ret;
964                                 *search_done = 1;
965                                 goto again;
966                         }
967                         kfree(victim_name);
968
969                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
970                 }
971
972                 /*
973                  * NOTE: we have searched root tree and checked the
974                  * coresponding ref, it does not need to check again.
975                  */
976                 *search_done = 1;
977         }
978         btrfs_release_path(path);
979
980         /* Same search but for extended refs */
981         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
982                                            inode_objectid, parent_objectid, 0,
983                                            0);
984         if (!IS_ERR_OR_NULL(extref)) {
985                 u32 item_size;
986                 u32 cur_offset = 0;
987                 unsigned long base;
988                 struct inode *victim_parent;
989
990                 leaf = path->nodes[0];
991
992                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
993                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
994
995                 while (cur_offset < item_size) {
996                         extref = (struct btrfs_inode_extref *)base + cur_offset;
997
998                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
999
1000                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1001                                 goto next;
1002
1003                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1004                         if (!victim_name)
1005                                 return -ENOMEM;
1006                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1007                                            victim_name_len);
1008
1009                         search_key.objectid = inode_objectid;
1010                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1011                         search_key.offset = btrfs_extref_hash(parent_objectid,
1012                                                               victim_name,
1013                                                               victim_name_len);
1014                         ret = 0;
1015                         if (!backref_in_log(log_root, &search_key,
1016                                             parent_objectid, victim_name,
1017                                             victim_name_len)) {
1018                                 ret = -ENOENT;
1019                                 victim_parent = read_one_inode(root,
1020                                                                parent_objectid);
1021                                 if (victim_parent) {
1022                                         inc_nlink(inode);
1023                                         btrfs_release_path(path);
1024
1025                                         ret = btrfs_unlink_inode(trans, root,
1026                                                                  victim_parent,
1027                                                                  inode,
1028                                                                  victim_name,
1029                                                                  victim_name_len);
1030                                         if (!ret)
1031                                                 ret = btrfs_run_delayed_items(
1032                                                                   trans, root);
1033                                 }
1034                                 iput(victim_parent);
1035                                 kfree(victim_name);
1036                                 if (ret)
1037                                         return ret;
1038                                 *search_done = 1;
1039                                 goto again;
1040                         }
1041                         kfree(victim_name);
1042                         if (ret)
1043                                 return ret;
1044 next:
1045                         cur_offset += victim_name_len + sizeof(*extref);
1046                 }
1047                 *search_done = 1;
1048         }
1049         btrfs_release_path(path);
1050
1051         /* look for a conflicting sequence number */
1052         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1053                                          ref_index, name, namelen, 0);
1054         if (di && !IS_ERR(di)) {
1055                 ret = drop_one_dir_item(trans, root, path, dir, di);
1056                 if (ret)
1057                         return ret;
1058         }
1059         btrfs_release_path(path);
1060
1061         /* look for a conflicing name */
1062         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1063                                    name, namelen, 0);
1064         if (di && !IS_ERR(di)) {
1065                 ret = drop_one_dir_item(trans, root, path, dir, di);
1066                 if (ret)
1067                         return ret;
1068         }
1069         btrfs_release_path(path);
1070
1071         return 0;
1072 }
1073
1074 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1075                              u32 *namelen, char **name, u64 *index,
1076                              u64 *parent_objectid)
1077 {
1078         struct btrfs_inode_extref *extref;
1079
1080         extref = (struct btrfs_inode_extref *)ref_ptr;
1081
1082         *namelen = btrfs_inode_extref_name_len(eb, extref);
1083         *name = kmalloc(*namelen, GFP_NOFS);
1084         if (*name == NULL)
1085                 return -ENOMEM;
1086
1087         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1088                            *namelen);
1089
1090         *index = btrfs_inode_extref_index(eb, extref);
1091         if (parent_objectid)
1092                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1093
1094         return 0;
1095 }
1096
1097 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1098                           u32 *namelen, char **name, u64 *index)
1099 {
1100         struct btrfs_inode_ref *ref;
1101
1102         ref = (struct btrfs_inode_ref *)ref_ptr;
1103
1104         *namelen = btrfs_inode_ref_name_len(eb, ref);
1105         *name = kmalloc(*namelen, GFP_NOFS);
1106         if (*name == NULL)
1107                 return -ENOMEM;
1108
1109         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1110
1111         *index = btrfs_inode_ref_index(eb, ref);
1112
1113         return 0;
1114 }
1115
1116 /*
1117  * replay one inode back reference item found in the log tree.
1118  * eb, slot and key refer to the buffer and key found in the log tree.
1119  * root is the destination we are replaying into, and path is for temp
1120  * use by this function.  (it should be released on return).
1121  */
1122 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1123                                   struct btrfs_root *root,
1124                                   struct btrfs_root *log,
1125                                   struct btrfs_path *path,
1126                                   struct extent_buffer *eb, int slot,
1127                                   struct btrfs_key *key)
1128 {
1129         struct inode *dir = NULL;
1130         struct inode *inode = NULL;
1131         unsigned long ref_ptr;
1132         unsigned long ref_end;
1133         char *name = NULL;
1134         int namelen;
1135         int ret;
1136         int search_done = 0;
1137         int log_ref_ver = 0;
1138         u64 parent_objectid;
1139         u64 inode_objectid;
1140         u64 ref_index = 0;
1141         int ref_struct_size;
1142
1143         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1144         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1145
1146         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1147                 struct btrfs_inode_extref *r;
1148
1149                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1150                 log_ref_ver = 1;
1151                 r = (struct btrfs_inode_extref *)ref_ptr;
1152                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1153         } else {
1154                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1155                 parent_objectid = key->offset;
1156         }
1157         inode_objectid = key->objectid;
1158
1159         /*
1160          * it is possible that we didn't log all the parent directories
1161          * for a given inode.  If we don't find the dir, just don't
1162          * copy the back ref in.  The link count fixup code will take
1163          * care of the rest
1164          */
1165         dir = read_one_inode(root, parent_objectid);
1166         if (!dir) {
1167                 ret = -ENOENT;
1168                 goto out;
1169         }
1170
1171         inode = read_one_inode(root, inode_objectid);
1172         if (!inode) {
1173                 ret = -EIO;
1174                 goto out;
1175         }
1176
1177         while (ref_ptr < ref_end) {
1178                 if (log_ref_ver) {
1179                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1180                                                 &ref_index, &parent_objectid);
1181                         /*
1182                          * parent object can change from one array
1183                          * item to another.
1184                          */
1185                         if (!dir)
1186                                 dir = read_one_inode(root, parent_objectid);
1187                         if (!dir) {
1188                                 ret = -ENOENT;
1189                                 goto out;
1190                         }
1191                 } else {
1192                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1193                                              &ref_index);
1194                 }
1195                 if (ret)
1196                         goto out;
1197
1198                 /* if we already have a perfect match, we're done */
1199                 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1200                                   ref_index, name, namelen)) {
1201                         /*
1202                          * look for a conflicting back reference in the
1203                          * metadata. if we find one we have to unlink that name
1204                          * of the file before we add our new link.  Later on, we
1205                          * overwrite any existing back reference, and we don't
1206                          * want to create dangling pointers in the directory.
1207                          */
1208
1209                         if (!search_done) {
1210                                 ret = __add_inode_ref(trans, root, path, log,
1211                                                       dir, inode, eb,
1212                                                       inode_objectid,
1213                                                       parent_objectid,
1214                                                       ref_index, name, namelen,
1215                                                       &search_done);
1216                                 if (ret) {
1217                                         if (ret == 1)
1218                                                 ret = 0;
1219                                         goto out;
1220                                 }
1221                         }
1222
1223                         /* insert our name */
1224                         ret = btrfs_add_link(trans, dir, inode, name, namelen,
1225                                              0, ref_index);
1226                         if (ret)
1227                                 goto out;
1228
1229                         btrfs_update_inode(trans, root, inode);
1230                 }
1231
1232                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1233                 kfree(name);
1234                 name = NULL;
1235                 if (log_ref_ver) {
1236                         iput(dir);
1237                         dir = NULL;
1238                 }
1239         }
1240
1241         /* finally write the back reference in the inode */
1242         ret = overwrite_item(trans, root, path, eb, slot, key);
1243 out:
1244         btrfs_release_path(path);
1245         kfree(name);
1246         iput(dir);
1247         iput(inode);
1248         return ret;
1249 }
1250
1251 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1252                               struct btrfs_root *root, u64 offset)
1253 {
1254         int ret;
1255         ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
1256                         offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
1257         if (ret > 0)
1258                 ret = btrfs_insert_orphan_item(trans, root, offset);
1259         return ret;
1260 }
1261
1262 static int count_inode_extrefs(struct btrfs_root *root,
1263                                struct inode *inode, struct btrfs_path *path)
1264 {
1265         int ret = 0;
1266         int name_len;
1267         unsigned int nlink = 0;
1268         u32 item_size;
1269         u32 cur_offset = 0;
1270         u64 inode_objectid = btrfs_ino(inode);
1271         u64 offset = 0;
1272         unsigned long ptr;
1273         struct btrfs_inode_extref *extref;
1274         struct extent_buffer *leaf;
1275
1276         while (1) {
1277                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1278                                             &extref, &offset);
1279                 if (ret)
1280                         break;
1281
1282                 leaf = path->nodes[0];
1283                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1284                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1285
1286                 while (cur_offset < item_size) {
1287                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1288                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1289
1290                         nlink++;
1291
1292                         cur_offset += name_len + sizeof(*extref);
1293                 }
1294
1295                 offset++;
1296                 btrfs_release_path(path);
1297         }
1298         btrfs_release_path(path);
1299
1300         if (ret < 0)
1301                 return ret;
1302         return nlink;
1303 }
1304
1305 static int count_inode_refs(struct btrfs_root *root,
1306                                struct inode *inode, struct btrfs_path *path)
1307 {
1308         int ret;
1309         struct btrfs_key key;
1310         unsigned int nlink = 0;
1311         unsigned long ptr;
1312         unsigned long ptr_end;
1313         int name_len;
1314         u64 ino = btrfs_ino(inode);
1315
1316         key.objectid = ino;
1317         key.type = BTRFS_INODE_REF_KEY;
1318         key.offset = (u64)-1;
1319
1320         while (1) {
1321                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1322                 if (ret < 0)
1323                         break;
1324                 if (ret > 0) {
1325                         if (path->slots[0] == 0)
1326                                 break;
1327                         path->slots[0]--;
1328                 }
1329 process_slot:
1330                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1331                                       path->slots[0]);
1332                 if (key.objectid != ino ||
1333                     key.type != BTRFS_INODE_REF_KEY)
1334                         break;
1335                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1336                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1337                                                    path->slots[0]);
1338                 while (ptr < ptr_end) {
1339                         struct btrfs_inode_ref *ref;
1340
1341                         ref = (struct btrfs_inode_ref *)ptr;
1342                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1343                                                             ref);
1344                         ptr = (unsigned long)(ref + 1) + name_len;
1345                         nlink++;
1346                 }
1347
1348                 if (key.offset == 0)
1349                         break;
1350                 if (path->slots[0] > 0) {
1351                         path->slots[0]--;
1352                         goto process_slot;
1353                 }
1354                 key.offset--;
1355                 btrfs_release_path(path);
1356         }
1357         btrfs_release_path(path);
1358
1359         return nlink;
1360 }
1361
1362 /*
1363  * There are a few corners where the link count of the file can't
1364  * be properly maintained during replay.  So, instead of adding
1365  * lots of complexity to the log code, we just scan the backrefs
1366  * for any file that has been through replay.
1367  *
1368  * The scan will update the link count on the inode to reflect the
1369  * number of back refs found.  If it goes down to zero, the iput
1370  * will free the inode.
1371  */
1372 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1373                                            struct btrfs_root *root,
1374                                            struct inode *inode)
1375 {
1376         struct btrfs_path *path;
1377         int ret;
1378         u64 nlink = 0;
1379         u64 ino = btrfs_ino(inode);
1380
1381         path = btrfs_alloc_path();
1382         if (!path)
1383                 return -ENOMEM;
1384
1385         ret = count_inode_refs(root, inode, path);
1386         if (ret < 0)
1387                 goto out;
1388
1389         nlink = ret;
1390
1391         ret = count_inode_extrefs(root, inode, path);
1392         if (ret == -ENOENT)
1393                 ret = 0;
1394
1395         if (ret < 0)
1396                 goto out;
1397
1398         nlink += ret;
1399
1400         ret = 0;
1401
1402         if (nlink != inode->i_nlink) {
1403                 set_nlink(inode, nlink);
1404                 btrfs_update_inode(trans, root, inode);
1405         }
1406         BTRFS_I(inode)->index_cnt = (u64)-1;
1407
1408         if (inode->i_nlink == 0) {
1409                 if (S_ISDIR(inode->i_mode)) {
1410                         ret = replay_dir_deletes(trans, root, NULL, path,
1411                                                  ino, 1);
1412                         if (ret)
1413                                 goto out;
1414                 }
1415                 ret = insert_orphan_item(trans, root, ino);
1416         }
1417
1418 out:
1419         btrfs_free_path(path);
1420         return ret;
1421 }
1422
1423 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1424                                             struct btrfs_root *root,
1425                                             struct btrfs_path *path)
1426 {
1427         int ret;
1428         struct btrfs_key key;
1429         struct inode *inode;
1430
1431         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1432         key.type = BTRFS_ORPHAN_ITEM_KEY;
1433         key.offset = (u64)-1;
1434         while (1) {
1435                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1436                 if (ret < 0)
1437                         break;
1438
1439                 if (ret == 1) {
1440                         if (path->slots[0] == 0)
1441                                 break;
1442                         path->slots[0]--;
1443                 }
1444
1445                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1446                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1447                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1448                         break;
1449
1450                 ret = btrfs_del_item(trans, root, path);
1451                 if (ret)
1452                         goto out;
1453
1454                 btrfs_release_path(path);
1455                 inode = read_one_inode(root, key.offset);
1456                 if (!inode)
1457                         return -EIO;
1458
1459                 ret = fixup_inode_link_count(trans, root, inode);
1460                 iput(inode);
1461                 if (ret)
1462                         goto out;
1463
1464                 /*
1465                  * fixup on a directory may create new entries,
1466                  * make sure we always look for the highset possible
1467                  * offset
1468                  */
1469                 key.offset = (u64)-1;
1470         }
1471         ret = 0;
1472 out:
1473         btrfs_release_path(path);
1474         return ret;
1475 }
1476
1477
1478 /*
1479  * record a given inode in the fixup dir so we can check its link
1480  * count when replay is done.  The link count is incremented here
1481  * so the inode won't go away until we check it
1482  */
1483 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1484                                       struct btrfs_root *root,
1485                                       struct btrfs_path *path,
1486                                       u64 objectid)
1487 {
1488         struct btrfs_key key;
1489         int ret = 0;
1490         struct inode *inode;
1491
1492         inode = read_one_inode(root, objectid);
1493         if (!inode)
1494                 return -EIO;
1495
1496         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1497         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1498         key.offset = objectid;
1499
1500         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1501
1502         btrfs_release_path(path);
1503         if (ret == 0) {
1504                 if (!inode->i_nlink)
1505                         set_nlink(inode, 1);
1506                 else
1507                         inc_nlink(inode);
1508                 ret = btrfs_update_inode(trans, root, inode);
1509         } else if (ret == -EEXIST) {
1510                 ret = 0;
1511         } else {
1512                 BUG(); /* Logic Error */
1513         }
1514         iput(inode);
1515
1516         return ret;
1517 }
1518
1519 /*
1520  * when replaying the log for a directory, we only insert names
1521  * for inodes that actually exist.  This means an fsync on a directory
1522  * does not implicitly fsync all the new files in it
1523  */
1524 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1525                                     struct btrfs_root *root,
1526                                     struct btrfs_path *path,
1527                                     u64 dirid, u64 index,
1528                                     char *name, int name_len, u8 type,
1529                                     struct btrfs_key *location)
1530 {
1531         struct inode *inode;
1532         struct inode *dir;
1533         int ret;
1534
1535         inode = read_one_inode(root, location->objectid);
1536         if (!inode)
1537                 return -ENOENT;
1538
1539         dir = read_one_inode(root, dirid);
1540         if (!dir) {
1541                 iput(inode);
1542                 return -EIO;
1543         }
1544
1545         ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1546
1547         /* FIXME, put inode into FIXUP list */
1548
1549         iput(inode);
1550         iput(dir);
1551         return ret;
1552 }
1553
1554 /*
1555  * take a single entry in a log directory item and replay it into
1556  * the subvolume.
1557  *
1558  * if a conflicting item exists in the subdirectory already,
1559  * the inode it points to is unlinked and put into the link count
1560  * fix up tree.
1561  *
1562  * If a name from the log points to a file or directory that does
1563  * not exist in the FS, it is skipped.  fsyncs on directories
1564  * do not force down inodes inside that directory, just changes to the
1565  * names or unlinks in a directory.
1566  */
1567 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1568                                     struct btrfs_root *root,
1569                                     struct btrfs_path *path,
1570                                     struct extent_buffer *eb,
1571                                     struct btrfs_dir_item *di,
1572                                     struct btrfs_key *key)
1573 {
1574         char *name;
1575         int name_len;
1576         struct btrfs_dir_item *dst_di;
1577         struct btrfs_key found_key;
1578         struct btrfs_key log_key;
1579         struct inode *dir;
1580         u8 log_type;
1581         int exists;
1582         int ret = 0;
1583         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1584
1585         dir = read_one_inode(root, key->objectid);
1586         if (!dir)
1587                 return -EIO;
1588
1589         name_len = btrfs_dir_name_len(eb, di);
1590         name = kmalloc(name_len, GFP_NOFS);
1591         if (!name) {
1592                 ret = -ENOMEM;
1593                 goto out;
1594         }
1595
1596         log_type = btrfs_dir_type(eb, di);
1597         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1598                    name_len);
1599
1600         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1601         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1602         if (exists == 0)
1603                 exists = 1;
1604         else
1605                 exists = 0;
1606         btrfs_release_path(path);
1607
1608         if (key->type == BTRFS_DIR_ITEM_KEY) {
1609                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1610                                        name, name_len, 1);
1611         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1612                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1613                                                      key->objectid,
1614                                                      key->offset, name,
1615                                                      name_len, 1);
1616         } else {
1617                 /* Corruption */
1618                 ret = -EINVAL;
1619                 goto out;
1620         }
1621         if (IS_ERR_OR_NULL(dst_di)) {
1622                 /* we need a sequence number to insert, so we only
1623                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1624                  */
1625                 if (key->type != BTRFS_DIR_INDEX_KEY)
1626                         goto out;
1627                 goto insert;
1628         }
1629
1630         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1631         /* the existing item matches the logged item */
1632         if (found_key.objectid == log_key.objectid &&
1633             found_key.type == log_key.type &&
1634             found_key.offset == log_key.offset &&
1635             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1636                 goto out;
1637         }
1638
1639         /*
1640          * don't drop the conflicting directory entry if the inode
1641          * for the new entry doesn't exist
1642          */
1643         if (!exists)
1644                 goto out;
1645
1646         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1647         if (ret)
1648                 goto out;
1649
1650         if (key->type == BTRFS_DIR_INDEX_KEY)
1651                 goto insert;
1652 out:
1653         btrfs_release_path(path);
1654         if (!ret && update_size) {
1655                 btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1656                 ret = btrfs_update_inode(trans, root, dir);
1657         }
1658         kfree(name);
1659         iput(dir);
1660         return ret;
1661
1662 insert:
1663         btrfs_release_path(path);
1664         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1665                               name, name_len, log_type, &log_key);
1666         if (ret && ret != -ENOENT)
1667                 goto out;
1668         update_size = false;
1669         ret = 0;
1670         goto out;
1671 }
1672
1673 /*
1674  * find all the names in a directory item and reconcile them into
1675  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1676  * one name in a directory item, but the same code gets used for
1677  * both directory index types
1678  */
1679 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1680                                         struct btrfs_root *root,
1681                                         struct btrfs_path *path,
1682                                         struct extent_buffer *eb, int slot,
1683                                         struct btrfs_key *key)
1684 {
1685         int ret;
1686         u32 item_size = btrfs_item_size_nr(eb, slot);
1687         struct btrfs_dir_item *di;
1688         int name_len;
1689         unsigned long ptr;
1690         unsigned long ptr_end;
1691
1692         ptr = btrfs_item_ptr_offset(eb, slot);
1693         ptr_end = ptr + item_size;
1694         while (ptr < ptr_end) {
1695                 di = (struct btrfs_dir_item *)ptr;
1696                 if (verify_dir_item(root, eb, di))
1697                         return -EIO;
1698                 name_len = btrfs_dir_name_len(eb, di);
1699                 ret = replay_one_name(trans, root, path, eb, di, key);
1700                 if (ret)
1701                         return ret;
1702                 ptr = (unsigned long)(di + 1);
1703                 ptr += name_len;
1704         }
1705         return 0;
1706 }
1707
1708 /*
1709  * directory replay has two parts.  There are the standard directory
1710  * items in the log copied from the subvolume, and range items
1711  * created in the log while the subvolume was logged.
1712  *
1713  * The range items tell us which parts of the key space the log
1714  * is authoritative for.  During replay, if a key in the subvolume
1715  * directory is in a logged range item, but not actually in the log
1716  * that means it was deleted from the directory before the fsync
1717  * and should be removed.
1718  */
1719 static noinline int find_dir_range(struct btrfs_root *root,
1720                                    struct btrfs_path *path,
1721                                    u64 dirid, int key_type,
1722                                    u64 *start_ret, u64 *end_ret)
1723 {
1724         struct btrfs_key key;
1725         u64 found_end;
1726         struct btrfs_dir_log_item *item;
1727         int ret;
1728         int nritems;
1729
1730         if (*start_ret == (u64)-1)
1731                 return 1;
1732
1733         key.objectid = dirid;
1734         key.type = key_type;
1735         key.offset = *start_ret;
1736
1737         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1738         if (ret < 0)
1739                 goto out;
1740         if (ret > 0) {
1741                 if (path->slots[0] == 0)
1742                         goto out;
1743                 path->slots[0]--;
1744         }
1745         if (ret != 0)
1746                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1747
1748         if (key.type != key_type || key.objectid != dirid) {
1749                 ret = 1;
1750                 goto next;
1751         }
1752         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1753                               struct btrfs_dir_log_item);
1754         found_end = btrfs_dir_log_end(path->nodes[0], item);
1755
1756         if (*start_ret >= key.offset && *start_ret <= found_end) {
1757                 ret = 0;
1758                 *start_ret = key.offset;
1759                 *end_ret = found_end;
1760                 goto out;
1761         }
1762         ret = 1;
1763 next:
1764         /* check the next slot in the tree to see if it is a valid item */
1765         nritems = btrfs_header_nritems(path->nodes[0]);
1766         if (path->slots[0] >= nritems) {
1767                 ret = btrfs_next_leaf(root, path);
1768                 if (ret)
1769                         goto out;
1770         } else {
1771                 path->slots[0]++;
1772         }
1773
1774         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1775
1776         if (key.type != key_type || key.objectid != dirid) {
1777                 ret = 1;
1778                 goto out;
1779         }
1780         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1781                               struct btrfs_dir_log_item);
1782         found_end = btrfs_dir_log_end(path->nodes[0], item);
1783         *start_ret = key.offset;
1784         *end_ret = found_end;
1785         ret = 0;
1786 out:
1787         btrfs_release_path(path);
1788         return ret;
1789 }
1790
1791 /*
1792  * this looks for a given directory item in the log.  If the directory
1793  * item is not in the log, the item is removed and the inode it points
1794  * to is unlinked
1795  */
1796 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1797                                       struct btrfs_root *root,
1798                                       struct btrfs_root *log,
1799                                       struct btrfs_path *path,
1800                                       struct btrfs_path *log_path,
1801                                       struct inode *dir,
1802                                       struct btrfs_key *dir_key)
1803 {
1804         int ret;
1805         struct extent_buffer *eb;
1806         int slot;
1807         u32 item_size;
1808         struct btrfs_dir_item *di;
1809         struct btrfs_dir_item *log_di;
1810         int name_len;
1811         unsigned long ptr;
1812         unsigned long ptr_end;
1813         char *name;
1814         struct inode *inode;
1815         struct btrfs_key location;
1816
1817 again:
1818         eb = path->nodes[0];
1819         slot = path->slots[0];
1820         item_size = btrfs_item_size_nr(eb, slot);
1821         ptr = btrfs_item_ptr_offset(eb, slot);
1822         ptr_end = ptr + item_size;
1823         while (ptr < ptr_end) {
1824                 di = (struct btrfs_dir_item *)ptr;
1825                 if (verify_dir_item(root, eb, di)) {
1826                         ret = -EIO;
1827                         goto out;
1828                 }
1829
1830                 name_len = btrfs_dir_name_len(eb, di);
1831                 name = kmalloc(name_len, GFP_NOFS);
1832                 if (!name) {
1833                         ret = -ENOMEM;
1834                         goto out;
1835                 }
1836                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1837                                   name_len);
1838                 log_di = NULL;
1839                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1840                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
1841                                                        dir_key->objectid,
1842                                                        name, name_len, 0);
1843                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1844                         log_di = btrfs_lookup_dir_index_item(trans, log,
1845                                                      log_path,
1846                                                      dir_key->objectid,
1847                                                      dir_key->offset,
1848                                                      name, name_len, 0);
1849                 }
1850                 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
1851                         btrfs_dir_item_key_to_cpu(eb, di, &location);
1852                         btrfs_release_path(path);
1853                         btrfs_release_path(log_path);
1854                         inode = read_one_inode(root, location.objectid);
1855                         if (!inode) {
1856                                 kfree(name);
1857                                 return -EIO;
1858                         }
1859
1860                         ret = link_to_fixup_dir(trans, root,
1861                                                 path, location.objectid);
1862                         if (ret) {
1863                                 kfree(name);
1864                                 iput(inode);
1865                                 goto out;
1866                         }
1867
1868                         inc_nlink(inode);
1869                         ret = btrfs_unlink_inode(trans, root, dir, inode,
1870                                                  name, name_len);
1871                         if (!ret)
1872                                 ret = btrfs_run_delayed_items(trans, root);
1873                         kfree(name);
1874                         iput(inode);
1875                         if (ret)
1876                                 goto out;
1877
1878                         /* there might still be more names under this key
1879                          * check and repeat if required
1880                          */
1881                         ret = btrfs_search_slot(NULL, root, dir_key, path,
1882                                                 0, 0);
1883                         if (ret == 0)
1884                                 goto again;
1885                         ret = 0;
1886                         goto out;
1887                 } else if (IS_ERR(log_di)) {
1888                         kfree(name);
1889                         return PTR_ERR(log_di);
1890                 }
1891                 btrfs_release_path(log_path);
1892                 kfree(name);
1893
1894                 ptr = (unsigned long)(di + 1);
1895                 ptr += name_len;
1896         }
1897         ret = 0;
1898 out:
1899         btrfs_release_path(path);
1900         btrfs_release_path(log_path);
1901         return ret;
1902 }
1903
1904 /*
1905  * deletion replay happens before we copy any new directory items
1906  * out of the log or out of backreferences from inodes.  It
1907  * scans the log to find ranges of keys that log is authoritative for,
1908  * and then scans the directory to find items in those ranges that are
1909  * not present in the log.
1910  *
1911  * Anything we don't find in the log is unlinked and removed from the
1912  * directory.
1913  */
1914 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1915                                        struct btrfs_root *root,
1916                                        struct btrfs_root *log,
1917                                        struct btrfs_path *path,
1918                                        u64 dirid, int del_all)
1919 {
1920         u64 range_start;
1921         u64 range_end;
1922         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1923         int ret = 0;
1924         struct btrfs_key dir_key;
1925         struct btrfs_key found_key;
1926         struct btrfs_path *log_path;
1927         struct inode *dir;
1928
1929         dir_key.objectid = dirid;
1930         dir_key.type = BTRFS_DIR_ITEM_KEY;
1931         log_path = btrfs_alloc_path();
1932         if (!log_path)
1933                 return -ENOMEM;
1934
1935         dir = read_one_inode(root, dirid);
1936         /* it isn't an error if the inode isn't there, that can happen
1937          * because we replay the deletes before we copy in the inode item
1938          * from the log
1939          */
1940         if (!dir) {
1941                 btrfs_free_path(log_path);
1942                 return 0;
1943         }
1944 again:
1945         range_start = 0;
1946         range_end = 0;
1947         while (1) {
1948                 if (del_all)
1949                         range_end = (u64)-1;
1950                 else {
1951                         ret = find_dir_range(log, path, dirid, key_type,
1952                                              &range_start, &range_end);
1953                         if (ret != 0)
1954                                 break;
1955                 }
1956
1957                 dir_key.offset = range_start;
1958                 while (1) {
1959                         int nritems;
1960                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
1961                                                 0, 0);
1962                         if (ret < 0)
1963                                 goto out;
1964
1965                         nritems = btrfs_header_nritems(path->nodes[0]);
1966                         if (path->slots[0] >= nritems) {
1967                                 ret = btrfs_next_leaf(root, path);
1968                                 if (ret)
1969                                         break;
1970                         }
1971                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1972                                               path->slots[0]);
1973                         if (found_key.objectid != dirid ||
1974                             found_key.type != dir_key.type)
1975                                 goto next_type;
1976
1977                         if (found_key.offset > range_end)
1978                                 break;
1979
1980                         ret = check_item_in_log(trans, root, log, path,
1981                                                 log_path, dir,
1982                                                 &found_key);
1983                         if (ret)
1984                                 goto out;
1985                         if (found_key.offset == (u64)-1)
1986                                 break;
1987                         dir_key.offset = found_key.offset + 1;
1988                 }
1989                 btrfs_release_path(path);
1990                 if (range_end == (u64)-1)
1991                         break;
1992                 range_start = range_end + 1;
1993         }
1994
1995 next_type:
1996         ret = 0;
1997         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1998                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1999                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2000                 btrfs_release_path(path);
2001                 goto again;
2002         }
2003 out:
2004         btrfs_release_path(path);
2005         btrfs_free_path(log_path);
2006         iput(dir);
2007         return ret;
2008 }
2009
2010 /*
2011  * the process_func used to replay items from the log tree.  This
2012  * gets called in two different stages.  The first stage just looks
2013  * for inodes and makes sure they are all copied into the subvolume.
2014  *
2015  * The second stage copies all the other item types from the log into
2016  * the subvolume.  The two stage approach is slower, but gets rid of
2017  * lots of complexity around inodes referencing other inodes that exist
2018  * only in the log (references come from either directory items or inode
2019  * back refs).
2020  */
2021 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2022                              struct walk_control *wc, u64 gen)
2023 {
2024         int nritems;
2025         struct btrfs_path *path;
2026         struct btrfs_root *root = wc->replay_dest;
2027         struct btrfs_key key;
2028         int level;
2029         int i;
2030         int ret;
2031
2032         ret = btrfs_read_buffer(eb, gen);
2033         if (ret)
2034                 return ret;
2035
2036         level = btrfs_header_level(eb);
2037
2038         if (level != 0)
2039                 return 0;
2040
2041         path = btrfs_alloc_path();
2042         if (!path)
2043                 return -ENOMEM;
2044
2045         nritems = btrfs_header_nritems(eb);
2046         for (i = 0; i < nritems; i++) {
2047                 btrfs_item_key_to_cpu(eb, &key, i);
2048
2049                 /* inode keys are done during the first stage */
2050                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2051                     wc->stage == LOG_WALK_REPLAY_INODES) {
2052                         struct btrfs_inode_item *inode_item;
2053                         u32 mode;
2054
2055                         inode_item = btrfs_item_ptr(eb, i,
2056                                             struct btrfs_inode_item);
2057                         mode = btrfs_inode_mode(eb, inode_item);
2058                         if (S_ISDIR(mode)) {
2059                                 ret = replay_dir_deletes(wc->trans,
2060                                          root, log, path, key.objectid, 0);
2061                                 if (ret)
2062                                         break;
2063                         }
2064                         ret = overwrite_item(wc->trans, root, path,
2065                                              eb, i, &key);
2066                         if (ret)
2067                                 break;
2068
2069                         /* for regular files, make sure corresponding
2070                          * orhpan item exist. extents past the new EOF
2071                          * will be truncated later by orphan cleanup.
2072                          */
2073                         if (S_ISREG(mode)) {
2074                                 ret = insert_orphan_item(wc->trans, root,
2075                                                          key.objectid);
2076                                 if (ret)
2077                                         break;
2078                         }
2079
2080                         ret = link_to_fixup_dir(wc->trans, root,
2081                                                 path, key.objectid);
2082                         if (ret)
2083                                 break;
2084                 }
2085
2086                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2087                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2088                         ret = replay_one_dir_item(wc->trans, root, path,
2089                                                   eb, i, &key);
2090                         if (ret)
2091                                 break;
2092                 }
2093
2094                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2095                         continue;
2096
2097                 /* these keys are simply copied */
2098                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2099                         ret = overwrite_item(wc->trans, root, path,
2100                                              eb, i, &key);
2101                         if (ret)
2102                                 break;
2103                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2104                            key.type == BTRFS_INODE_EXTREF_KEY) {
2105                         ret = add_inode_ref(wc->trans, root, log, path,
2106                                             eb, i, &key);
2107                         if (ret && ret != -ENOENT)
2108                                 break;
2109                         ret = 0;
2110                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2111                         ret = replay_one_extent(wc->trans, root, path,
2112                                                 eb, i, &key);
2113                         if (ret)
2114                                 break;
2115                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2116                         ret = replay_one_dir_item(wc->trans, root, path,
2117                                                   eb, i, &key);
2118                         if (ret)
2119                                 break;
2120                 }
2121         }
2122         btrfs_free_path(path);
2123         return ret;
2124 }
2125
2126 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2127                                    struct btrfs_root *root,
2128                                    struct btrfs_path *path, int *level,
2129                                    struct walk_control *wc)
2130 {
2131         u64 root_owner;
2132         u64 bytenr;
2133         u64 ptr_gen;
2134         struct extent_buffer *next;
2135         struct extent_buffer *cur;
2136         struct extent_buffer *parent;
2137         u32 blocksize;
2138         int ret = 0;
2139
2140         WARN_ON(*level < 0);
2141         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2142
2143         while (*level > 0) {
2144                 WARN_ON(*level < 0);
2145                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2146                 cur = path->nodes[*level];
2147
2148                 WARN_ON(btrfs_header_level(cur) != *level);
2149
2150                 if (path->slots[*level] >=
2151                     btrfs_header_nritems(cur))
2152                         break;
2153
2154                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2155                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2156                 blocksize = btrfs_level_size(root, *level - 1);
2157
2158                 parent = path->nodes[*level];
2159                 root_owner = btrfs_header_owner(parent);
2160
2161                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
2162                 if (!next)
2163                         return -ENOMEM;
2164
2165                 if (*level == 1) {
2166                         ret = wc->process_func(root, next, wc, ptr_gen);
2167                         if (ret) {
2168                                 free_extent_buffer(next);
2169                                 return ret;
2170                         }
2171
2172                         path->slots[*level]++;
2173                         if (wc->free) {
2174                                 ret = btrfs_read_buffer(next, ptr_gen);
2175                                 if (ret) {
2176                                         free_extent_buffer(next);
2177                                         return ret;
2178                                 }
2179
2180                                 if (trans) {
2181                                         btrfs_tree_lock(next);
2182                                         btrfs_set_lock_blocking(next);
2183                                         clean_tree_block(trans, root, next);
2184                                         btrfs_wait_tree_block_writeback(next);
2185                                         btrfs_tree_unlock(next);
2186                                 }
2187
2188                                 WARN_ON(root_owner !=
2189                                         BTRFS_TREE_LOG_OBJECTID);
2190                                 ret = btrfs_free_and_pin_reserved_extent(root,
2191                                                          bytenr, blocksize);
2192                                 if (ret) {
2193                                         free_extent_buffer(next);
2194                                         return ret;
2195                                 }
2196                         }
2197                         free_extent_buffer(next);
2198                         continue;
2199                 }
2200                 ret = btrfs_read_buffer(next, ptr_gen);
2201                 if (ret) {
2202                         free_extent_buffer(next);
2203                         return ret;
2204                 }
2205
2206                 WARN_ON(*level <= 0);
2207                 if (path->nodes[*level-1])
2208                         free_extent_buffer(path->nodes[*level-1]);
2209                 path->nodes[*level-1] = next;
2210                 *level = btrfs_header_level(next);
2211                 path->slots[*level] = 0;
2212                 cond_resched();
2213         }
2214         WARN_ON(*level < 0);
2215         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2216
2217         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2218
2219         cond_resched();
2220         return 0;
2221 }
2222
2223 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2224                                  struct btrfs_root *root,
2225                                  struct btrfs_path *path, int *level,
2226                                  struct walk_control *wc)
2227 {
2228         u64 root_owner;
2229         int i;
2230         int slot;
2231         int ret;
2232
2233         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2234                 slot = path->slots[i];
2235                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2236                         path->slots[i]++;
2237                         *level = i;
2238                         WARN_ON(*level == 0);
2239                         return 0;
2240                 } else {
2241                         struct extent_buffer *parent;
2242                         if (path->nodes[*level] == root->node)
2243                                 parent = path->nodes[*level];
2244                         else
2245                                 parent = path->nodes[*level + 1];
2246
2247                         root_owner = btrfs_header_owner(parent);
2248                         ret = wc->process_func(root, path->nodes[*level], wc,
2249                                  btrfs_header_generation(path->nodes[*level]));
2250                         if (ret)
2251                                 return ret;
2252
2253                         if (wc->free) {
2254                                 struct extent_buffer *next;
2255
2256                                 next = path->nodes[*level];
2257
2258                                 if (trans) {
2259                                         btrfs_tree_lock(next);
2260                                         btrfs_set_lock_blocking(next);
2261                                         clean_tree_block(trans, root, next);
2262                                         btrfs_wait_tree_block_writeback(next);
2263                                         btrfs_tree_unlock(next);
2264                                 }
2265
2266                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2267                                 ret = btrfs_free_and_pin_reserved_extent(root,
2268                                                 path->nodes[*level]->start,
2269                                                 path->nodes[*level]->len);
2270                                 if (ret)
2271                                         return ret;
2272                         }
2273                         free_extent_buffer(path->nodes[*level]);
2274                         path->nodes[*level] = NULL;
2275                         *level = i + 1;
2276                 }
2277         }
2278         return 1;
2279 }
2280
2281 /*
2282  * drop the reference count on the tree rooted at 'snap'.  This traverses
2283  * the tree freeing any blocks that have a ref count of zero after being
2284  * decremented.
2285  */
2286 static int walk_log_tree(struct btrfs_trans_handle *trans,
2287                          struct btrfs_root *log, struct walk_control *wc)
2288 {
2289         int ret = 0;
2290         int wret;
2291         int level;
2292         struct btrfs_path *path;
2293         int orig_level;
2294
2295         path = btrfs_alloc_path();
2296         if (!path)
2297                 return -ENOMEM;
2298
2299         level = btrfs_header_level(log->node);
2300         orig_level = level;
2301         path->nodes[level] = log->node;
2302         extent_buffer_get(log->node);
2303         path->slots[level] = 0;
2304
2305         while (1) {
2306                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2307                 if (wret > 0)
2308                         break;
2309                 if (wret < 0) {
2310                         ret = wret;
2311                         goto out;
2312                 }
2313
2314                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2315                 if (wret > 0)
2316                         break;
2317                 if (wret < 0) {
2318                         ret = wret;
2319                         goto out;
2320                 }
2321         }
2322
2323         /* was the root node processed? if not, catch it here */
2324         if (path->nodes[orig_level]) {
2325                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2326                          btrfs_header_generation(path->nodes[orig_level]));
2327                 if (ret)
2328                         goto out;
2329                 if (wc->free) {
2330                         struct extent_buffer *next;
2331
2332                         next = path->nodes[orig_level];
2333
2334                         if (trans) {
2335                                 btrfs_tree_lock(next);
2336                                 btrfs_set_lock_blocking(next);
2337                                 clean_tree_block(trans, log, next);
2338                                 btrfs_wait_tree_block_writeback(next);
2339                                 btrfs_tree_unlock(next);
2340                         }
2341
2342                         WARN_ON(log->root_key.objectid !=
2343                                 BTRFS_TREE_LOG_OBJECTID);
2344                         ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2345                                                          next->len);
2346                         if (ret)
2347                                 goto out;
2348                 }
2349         }
2350
2351 out:
2352         btrfs_free_path(path);
2353         return ret;
2354 }
2355
2356 /*
2357  * helper function to update the item for a given subvolumes log root
2358  * in the tree of log roots
2359  */
2360 static int update_log_root(struct btrfs_trans_handle *trans,
2361                            struct btrfs_root *log)
2362 {
2363         int ret;
2364
2365         if (log->log_transid == 1) {
2366                 /* insert root item on the first sync */
2367                 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2368                                 &log->root_key, &log->root_item);
2369         } else {
2370                 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2371                                 &log->root_key, &log->root_item);
2372         }
2373         return ret;
2374 }
2375
2376 static void wait_log_commit(struct btrfs_trans_handle *trans,
2377                             struct btrfs_root *root, int transid)
2378 {
2379         DEFINE_WAIT(wait);
2380         int index = transid % 2;
2381
2382         /*
2383          * we only allow two pending log transactions at a time,
2384          * so we know that if ours is more than 2 older than the
2385          * current transaction, we're done
2386          */
2387         do {
2388                 prepare_to_wait(&root->log_commit_wait[index],
2389                                 &wait, TASK_UNINTERRUPTIBLE);
2390                 mutex_unlock(&root->log_mutex);
2391
2392                 if (root->log_transid_committed < transid &&
2393                     atomic_read(&root->log_commit[index]))
2394                         schedule();
2395
2396                 finish_wait(&root->log_commit_wait[index], &wait);
2397                 mutex_lock(&root->log_mutex);
2398         } while (root->log_transid_committed < transid &&
2399                  atomic_read(&root->log_commit[index]));
2400 }
2401
2402 static void wait_for_writer(struct btrfs_trans_handle *trans,
2403                             struct btrfs_root *root)
2404 {
2405         DEFINE_WAIT(wait);
2406
2407         while (atomic_read(&root->log_writers)) {
2408                 prepare_to_wait(&root->log_writer_wait,
2409                                 &wait, TASK_UNINTERRUPTIBLE);
2410                 mutex_unlock(&root->log_mutex);
2411                 if (atomic_read(&root->log_writers))
2412                         schedule();
2413                 mutex_lock(&root->log_mutex);
2414                 finish_wait(&root->log_writer_wait, &wait);
2415         }
2416 }
2417
2418 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2419                                         struct btrfs_log_ctx *ctx)
2420 {
2421         if (!ctx)
2422                 return;
2423
2424         mutex_lock(&root->log_mutex);
2425         list_del_init(&ctx->list);
2426         mutex_unlock(&root->log_mutex);
2427 }
2428
2429 /* 
2430  * Invoked in log mutex context, or be sure there is no other task which
2431  * can access the list.
2432  */
2433 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2434                                              int index, int error)
2435 {
2436         struct btrfs_log_ctx *ctx;
2437
2438         if (!error) {
2439                 INIT_LIST_HEAD(&root->log_ctxs[index]);
2440                 return;
2441         }
2442
2443         list_for_each_entry(ctx, &root->log_ctxs[index], list)
2444                 ctx->log_ret = error;
2445
2446         INIT_LIST_HEAD(&root->log_ctxs[index]);
2447 }
2448
2449 /*
2450  * btrfs_sync_log does sends a given tree log down to the disk and
2451  * updates the super blocks to record it.  When this call is done,
2452  * you know that any inodes previously logged are safely on disk only
2453  * if it returns 0.
2454  *
2455  * Any other return value means you need to call btrfs_commit_transaction.
2456  * Some of the edge cases for fsyncing directories that have had unlinks
2457  * or renames done in the past mean that sometimes the only safe
2458  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2459  * that has happened.
2460  */
2461 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2462                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2463 {
2464         int index1;
2465         int index2;
2466         int mark;
2467         int ret;
2468         struct btrfs_root *log = root->log_root;
2469         struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2470         int log_transid = 0;
2471         struct btrfs_log_ctx root_log_ctx;
2472         struct blk_plug plug;
2473
2474         mutex_lock(&root->log_mutex);
2475         log_transid = ctx->log_transid;
2476         if (root->log_transid_committed >= log_transid) {
2477                 mutex_unlock(&root->log_mutex);
2478                 return ctx->log_ret;
2479         }
2480
2481         index1 = log_transid % 2;
2482         if (atomic_read(&root->log_commit[index1])) {
2483                 wait_log_commit(trans, root, log_transid);
2484                 mutex_unlock(&root->log_mutex);
2485                 return ctx->log_ret;
2486         }
2487         ASSERT(log_transid == root->log_transid);
2488         atomic_set(&root->log_commit[index1], 1);
2489
2490         /* wait for previous tree log sync to complete */
2491         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2492                 wait_log_commit(trans, root, log_transid - 1);
2493
2494         while (1) {
2495                 int batch = atomic_read(&root->log_batch);
2496                 /* when we're on an ssd, just kick the log commit out */
2497                 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2498                         mutex_unlock(&root->log_mutex);
2499                         schedule_timeout_uninterruptible(1);
2500                         mutex_lock(&root->log_mutex);
2501                 }
2502                 wait_for_writer(trans, root);
2503                 if (batch == atomic_read(&root->log_batch))
2504                         break;
2505         }
2506
2507         /* bail out if we need to do a full commit */
2508         if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2509             trans->transid) {
2510                 ret = -EAGAIN;
2511                 btrfs_free_logged_extents(log, log_transid);
2512                 mutex_unlock(&root->log_mutex);
2513                 goto out;
2514         }
2515
2516         if (log_transid % 2 == 0)
2517                 mark = EXTENT_DIRTY;
2518         else
2519                 mark = EXTENT_NEW;
2520
2521         /* we start IO on  all the marked extents here, but we don't actually
2522          * wait for them until later.
2523          */
2524         blk_start_plug(&plug);
2525         ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2526         if (ret) {
2527                 blk_finish_plug(&plug);
2528                 btrfs_abort_transaction(trans, root, ret);
2529                 btrfs_free_logged_extents(log, log_transid);
2530                 mutex_unlock(&root->log_mutex);
2531                 goto out;
2532         }
2533
2534         btrfs_set_root_node(&log->root_item, log->node);
2535
2536         root->log_transid++;
2537         log->log_transid = root->log_transid;
2538         root->log_start_pid = 0;
2539         /*
2540          * IO has been started, blocks of the log tree have WRITTEN flag set
2541          * in their headers. new modifications of the log will be written to
2542          * new positions. so it's safe to allow log writers to go in.
2543          */
2544         mutex_unlock(&root->log_mutex);
2545
2546         btrfs_init_log_ctx(&root_log_ctx);
2547
2548         mutex_lock(&log_root_tree->log_mutex);
2549         atomic_inc(&log_root_tree->log_batch);
2550         atomic_inc(&log_root_tree->log_writers);
2551
2552         index2 = log_root_tree->log_transid % 2;
2553         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2554         root_log_ctx.log_transid = log_root_tree->log_transid;
2555
2556         mutex_unlock(&log_root_tree->log_mutex);
2557
2558         ret = update_log_root(trans, log);
2559
2560         mutex_lock(&log_root_tree->log_mutex);
2561         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2562                 smp_mb();
2563                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2564                         wake_up(&log_root_tree->log_writer_wait);
2565         }
2566
2567         if (ret) {
2568                 if (!list_empty(&root_log_ctx.list))
2569                         list_del_init(&root_log_ctx.list);
2570
2571                 blk_finish_plug(&plug);
2572                 if (ret != -ENOSPC) {
2573                         btrfs_abort_transaction(trans, root, ret);
2574                         mutex_unlock(&log_root_tree->log_mutex);
2575                         goto out;
2576                 }
2577                 ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
2578                                                                 trans->transid;
2579                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2580                 btrfs_free_logged_extents(log, log_transid);
2581                 mutex_unlock(&log_root_tree->log_mutex);
2582                 ret = -EAGAIN;
2583                 goto out;
2584         }
2585
2586         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2587                 mutex_unlock(&log_root_tree->log_mutex);
2588                 ret = root_log_ctx.log_ret;
2589                 goto out;
2590         }
2591
2592         index2 = root_log_ctx.log_transid % 2;
2593         if (atomic_read(&log_root_tree->log_commit[index2])) {
2594                 blk_finish_plug(&plug);
2595                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2596                 wait_log_commit(trans, log_root_tree,
2597                                 root_log_ctx.log_transid);
2598                 btrfs_free_logged_extents(log, log_transid);
2599                 mutex_unlock(&log_root_tree->log_mutex);
2600                 ret = root_log_ctx.log_ret;
2601                 goto out;
2602         }
2603         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2604         atomic_set(&log_root_tree->log_commit[index2], 1);
2605
2606         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2607                 wait_log_commit(trans, log_root_tree,
2608                                 root_log_ctx.log_transid - 1);
2609         }
2610
2611         wait_for_writer(trans, log_root_tree);
2612
2613         /*
2614          * now that we've moved on to the tree of log tree roots,
2615          * check the full commit flag again
2616          */
2617         if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
2618             trans->transid) {
2619                 blk_finish_plug(&plug);
2620                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2621                 btrfs_free_logged_extents(log, log_transid);
2622                 mutex_unlock(&log_root_tree->log_mutex);
2623                 ret = -EAGAIN;
2624                 goto out_wake_log_root;
2625         }
2626
2627         ret = btrfs_write_marked_extents(log_root_tree,
2628                                          &log_root_tree->dirty_log_pages,
2629                                          EXTENT_DIRTY | EXTENT_NEW);
2630         blk_finish_plug(&plug);
2631         if (ret) {
2632                 btrfs_abort_transaction(trans, root, ret);
2633                 btrfs_free_logged_extents(log, log_transid);
2634                 mutex_unlock(&log_root_tree->log_mutex);
2635                 goto out_wake_log_root;
2636         }
2637         btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2638         btrfs_wait_marked_extents(log_root_tree,
2639                                   &log_root_tree->dirty_log_pages,
2640                                   EXTENT_NEW | EXTENT_DIRTY);
2641         btrfs_wait_logged_extents(log, log_transid);
2642
2643         btrfs_set_super_log_root(root->fs_info->super_for_commit,
2644                                 log_root_tree->node->start);
2645         btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2646                                 btrfs_header_level(log_root_tree->node));
2647
2648         log_root_tree->log_transid++;
2649         mutex_unlock(&log_root_tree->log_mutex);
2650
2651         /*
2652          * nobody else is going to jump in and write the the ctree
2653          * super here because the log_commit atomic below is protecting
2654          * us.  We must be called with a transaction handle pinning
2655          * the running transaction open, so a full commit can't hop
2656          * in and cause problems either.
2657          */
2658         ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2659         if (ret) {
2660                 btrfs_abort_transaction(trans, root, ret);
2661                 goto out_wake_log_root;
2662         }
2663
2664         mutex_lock(&root->log_mutex);
2665         if (root->last_log_commit < log_transid)
2666                 root->last_log_commit = log_transid;
2667         mutex_unlock(&root->log_mutex);
2668
2669 out_wake_log_root:
2670         /*
2671          * We needn't get log_mutex here because we are sure all
2672          * the other tasks are blocked.
2673          */
2674         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2675
2676         mutex_lock(&log_root_tree->log_mutex);
2677         log_root_tree->log_transid_committed++;
2678         atomic_set(&log_root_tree->log_commit[index2], 0);
2679         mutex_unlock(&log_root_tree->log_mutex);
2680
2681         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2682                 wake_up(&log_root_tree->log_commit_wait[index2]);
2683 out:
2684         /* See above. */
2685         btrfs_remove_all_log_ctxs(root, index1, ret);
2686
2687         mutex_lock(&root->log_mutex);
2688         root->log_transid_committed++;
2689         atomic_set(&root->log_commit[index1], 0);
2690         mutex_unlock(&root->log_mutex);
2691
2692         if (waitqueue_active(&root->log_commit_wait[index1]))
2693                 wake_up(&root->log_commit_wait[index1]);
2694         return ret;
2695 }
2696
2697 static void free_log_tree(struct btrfs_trans_handle *trans,
2698                           struct btrfs_root *log)
2699 {
2700         int ret;
2701         u64 start;
2702         u64 end;
2703         struct walk_control wc = {
2704                 .free = 1,
2705                 .process_func = process_one_buffer
2706         };
2707
2708         ret = walk_log_tree(trans, log, &wc);
2709         /* I don't think this can happen but just in case */
2710         if (ret)
2711                 btrfs_abort_transaction(trans, log, ret);
2712
2713         while (1) {
2714                 ret = find_first_extent_bit(&log->dirty_log_pages,
2715                                 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2716                                 NULL);
2717                 if (ret)
2718                         break;
2719
2720                 clear_extent_bits(&log->dirty_log_pages, start, end,
2721                                   EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2722         }
2723
2724         /*
2725          * We may have short-circuited the log tree with the full commit logic
2726          * and left ordered extents on our list, so clear these out to keep us
2727          * from leaking inodes and memory.
2728          */
2729         btrfs_free_logged_extents(log, 0);
2730         btrfs_free_logged_extents(log, 1);
2731
2732         free_extent_buffer(log->node);
2733         kfree(log);
2734 }
2735
2736 /*
2737  * free all the extents used by the tree log.  This should be called
2738  * at commit time of the full transaction
2739  */
2740 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2741 {
2742         if (root->log_root) {
2743                 free_log_tree(trans, root->log_root);
2744                 root->log_root = NULL;
2745         }
2746         return 0;
2747 }
2748
2749 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2750                              struct btrfs_fs_info *fs_info)
2751 {
2752         if (fs_info->log_root_tree) {
2753                 free_log_tree(trans, fs_info->log_root_tree);
2754                 fs_info->log_root_tree = NULL;
2755         }
2756         return 0;
2757 }
2758
2759 /*
2760  * If both a file and directory are logged, and unlinks or renames are
2761  * mixed in, we have a few interesting corners:
2762  *
2763  * create file X in dir Y
2764  * link file X to X.link in dir Y
2765  * fsync file X
2766  * unlink file X but leave X.link
2767  * fsync dir Y
2768  *
2769  * After a crash we would expect only X.link to exist.  But file X
2770  * didn't get fsync'd again so the log has back refs for X and X.link.
2771  *
2772  * We solve this by removing directory entries and inode backrefs from the
2773  * log when a file that was logged in the current transaction is
2774  * unlinked.  Any later fsync will include the updated log entries, and
2775  * we'll be able to reconstruct the proper directory items from backrefs.
2776  *
2777  * This optimizations allows us to avoid relogging the entire inode
2778  * or the entire directory.
2779  */
2780 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2781                                  struct btrfs_root *root,
2782                                  const char *name, int name_len,
2783                                  struct inode *dir, u64 index)
2784 {
2785         struct btrfs_root *log;
2786         struct btrfs_dir_item *di;
2787         struct btrfs_path *path;
2788         int ret;
2789         int err = 0;
2790         int bytes_del = 0;
2791         u64 dir_ino = btrfs_ino(dir);
2792
2793         if (BTRFS_I(dir)->logged_trans < trans->transid)
2794                 return 0;
2795
2796         ret = join_running_log_trans(root);
2797         if (ret)
2798                 return 0;
2799
2800         mutex_lock(&BTRFS_I(dir)->log_mutex);
2801
2802         log = root->log_root;
2803         path = btrfs_alloc_path();
2804         if (!path) {
2805                 err = -ENOMEM;
2806                 goto out_unlock;
2807         }
2808
2809         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2810                                    name, name_len, -1);
2811         if (IS_ERR(di)) {
2812                 err = PTR_ERR(di);
2813                 goto fail;
2814         }
2815         if (di) {
2816                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2817                 bytes_del += name_len;
2818                 if (ret) {
2819                         err = ret;
2820                         goto fail;
2821                 }
2822         }
2823         btrfs_release_path(path);
2824         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2825                                          index, name, name_len, -1);
2826         if (IS_ERR(di)) {
2827                 err = PTR_ERR(di);
2828                 goto fail;
2829         }
2830         if (di) {
2831                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2832                 bytes_del += name_len;
2833                 if (ret) {
2834                         err = ret;
2835                         goto fail;
2836                 }
2837         }
2838
2839         /* update the directory size in the log to reflect the names
2840          * we have removed
2841          */
2842         if (bytes_del) {
2843                 struct btrfs_key key;
2844
2845                 key.objectid = dir_ino;
2846                 key.offset = 0;
2847                 key.type = BTRFS_INODE_ITEM_KEY;
2848                 btrfs_release_path(path);
2849
2850                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2851                 if (ret < 0) {
2852                         err = ret;
2853                         goto fail;
2854                 }
2855                 if (ret == 0) {
2856                         struct btrfs_inode_item *item;
2857                         u64 i_size;
2858
2859                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2860                                               struct btrfs_inode_item);
2861                         i_size = btrfs_inode_size(path->nodes[0], item);
2862                         if (i_size > bytes_del)
2863                                 i_size -= bytes_del;
2864                         else
2865                                 i_size = 0;
2866                         btrfs_set_inode_size(path->nodes[0], item, i_size);
2867                         btrfs_mark_buffer_dirty(path->nodes[0]);
2868                 } else
2869                         ret = 0;
2870                 btrfs_release_path(path);
2871         }
2872 fail:
2873         btrfs_free_path(path);
2874 out_unlock:
2875         mutex_unlock(&BTRFS_I(dir)->log_mutex);
2876         if (ret == -ENOSPC) {
2877                 root->fs_info->last_trans_log_full_commit = trans->transid;
2878                 ret = 0;
2879         } else if (ret < 0)
2880                 btrfs_abort_transaction(trans, root, ret);
2881
2882         btrfs_end_log_trans(root);
2883
2884         return err;
2885 }
2886
2887 /* see comments for btrfs_del_dir_entries_in_log */
2888 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2889                                struct btrfs_root *root,
2890                                const char *name, int name_len,
2891                                struct inode *inode, u64 dirid)
2892 {
2893         struct btrfs_root *log;
2894         u64 index;
2895         int ret;
2896
2897         if (BTRFS_I(inode)->logged_trans < trans->transid)
2898                 return 0;
2899
2900         ret = join_running_log_trans(root);
2901         if (ret)
2902                 return 0;
2903         log = root->log_root;
2904         mutex_lock(&BTRFS_I(inode)->log_mutex);
2905
2906         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2907                                   dirid, &index);
2908         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2909         if (ret == -ENOSPC) {
2910                 root->fs_info->last_trans_log_full_commit = trans->transid;
2911                 ret = 0;
2912         } else if (ret < 0 && ret != -ENOENT)
2913                 btrfs_abort_transaction(trans, root, ret);
2914         btrfs_end_log_trans(root);
2915
2916         return ret;
2917 }
2918
2919 /*
2920  * creates a range item in the log for 'dirid'.  first_offset and
2921  * last_offset tell us which parts of the key space the log should
2922  * be considered authoritative for.
2923  */
2924 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2925                                        struct btrfs_root *log,
2926                                        struct btrfs_path *path,
2927                                        int key_type, u64 dirid,
2928                                        u64 first_offset, u64 last_offset)
2929 {
2930         int ret;
2931         struct btrfs_key key;
2932         struct btrfs_dir_log_item *item;
2933
2934         key.objectid = dirid;
2935         key.offset = first_offset;
2936         if (key_type == BTRFS_DIR_ITEM_KEY)
2937                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2938         else
2939                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2940         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2941         if (ret)
2942                 return ret;
2943
2944         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2945                               struct btrfs_dir_log_item);
2946         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2947         btrfs_mark_buffer_dirty(path->nodes[0]);
2948         btrfs_release_path(path);
2949         return 0;
2950 }
2951
2952 /*
2953  * log all the items included in the current transaction for a given
2954  * directory.  This also creates the range items in the log tree required
2955  * to replay anything deleted before the fsync
2956  */
2957 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2958                           struct btrfs_root *root, struct inode *inode,
2959                           struct btrfs_path *path,
2960                           struct btrfs_path *dst_path, int key_type,
2961                           u64 min_offset, u64 *last_offset_ret)
2962 {
2963         struct btrfs_key min_key;
2964         struct btrfs_root *log = root->log_root;
2965         struct extent_buffer *src;
2966         int err = 0;
2967         int ret;
2968         int i;
2969         int nritems;
2970         u64 first_offset = min_offset;
2971         u64 last_offset = (u64)-1;
2972         u64 ino = btrfs_ino(inode);
2973
2974         log = root->log_root;
2975
2976         min_key.objectid = ino;
2977         min_key.type = key_type;
2978         min_key.offset = min_offset;
2979
2980         path->keep_locks = 1;
2981
2982         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
2983
2984         /*
2985          * we didn't find anything from this transaction, see if there
2986          * is anything at all
2987          */
2988         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2989                 min_key.objectid = ino;
2990                 min_key.type = key_type;
2991                 min_key.offset = (u64)-1;
2992                 btrfs_release_path(path);
2993                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2994                 if (ret < 0) {
2995                         btrfs_release_path(path);
2996                         return ret;
2997                 }
2998                 ret = btrfs_previous_item(root, path, ino, key_type);
2999
3000                 /* if ret == 0 there are items for this type,
3001                  * create a range to tell us the last key of this type.
3002                  * otherwise, there are no items in this directory after
3003                  * *min_offset, and we create a range to indicate that.
3004                  */
3005                 if (ret == 0) {
3006                         struct btrfs_key tmp;
3007                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3008                                               path->slots[0]);
3009                         if (key_type == tmp.type)
3010                                 first_offset = max(min_offset, tmp.offset) + 1;
3011                 }
3012                 goto done;
3013         }
3014
3015         /* go backward to find any previous key */
3016         ret = btrfs_previous_item(root, path, ino, key_type);
3017         if (ret == 0) {
3018                 struct btrfs_key tmp;
3019                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3020                 if (key_type == tmp.type) {
3021                         first_offset = tmp.offset;
3022                         ret = overwrite_item(trans, log, dst_path,
3023                                              path->nodes[0], path->slots[0],
3024                                              &tmp);
3025                         if (ret) {
3026                                 err = ret;
3027                                 goto done;
3028                         }
3029                 }
3030         }
3031         btrfs_release_path(path);
3032
3033         /* find the first key from this transaction again */
3034         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3035         if (WARN_ON(ret != 0))
3036                 goto done;
3037
3038         /*
3039          * we have a block from this transaction, log every item in it
3040          * from our directory
3041          */
3042         while (1) {
3043                 struct btrfs_key tmp;
3044                 src = path->nodes[0];
3045                 nritems = btrfs_header_nritems(src);
3046                 for (i = path->slots[0]; i < nritems; i++) {
3047                         btrfs_item_key_to_cpu(src, &min_key, i);
3048
3049                         if (min_key.objectid != ino || min_key.type != key_type)
3050                                 goto done;
3051                         ret = overwrite_item(trans, log, dst_path, src, i,
3052                                              &min_key);
3053                         if (ret) {
3054                                 err = ret;
3055                                 goto done;
3056                         }
3057                 }
3058                 path->slots[0] = nritems;
3059
3060                 /*
3061                  * look ahead to the next item and see if it is also
3062                  * from this directory and from this transaction
3063                  */
3064                 ret = btrfs_next_leaf(root, path);
3065                 if (ret == 1) {
3066                         last_offset = (u64)-1;
3067                         goto done;
3068                 }
3069                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3070                 if (tmp.objectid != ino || tmp.type != key_type) {
3071                         last_offset = (u64)-1;
3072                         goto done;
3073                 }
3074                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3075                         ret = overwrite_item(trans, log, dst_path,
3076                                              path->nodes[0], path->slots[0],
3077                                              &tmp);
3078                         if (ret)
3079                                 err = ret;
3080                         else
3081                                 last_offset = tmp.offset;
3082                         goto done;
3083                 }
3084         }
3085 done:
3086         btrfs_release_path(path);
3087         btrfs_release_path(dst_path);
3088
3089         if (err == 0) {
3090                 *last_offset_ret = last_offset;
3091                 /*
3092                  * insert the log range keys to indicate where the log
3093                  * is valid
3094                  */
3095                 ret = insert_dir_log_key(trans, log, path, key_type,
3096                                          ino, first_offset, last_offset);
3097                 if (ret)
3098                         err = ret;
3099         }
3100         return err;
3101 }
3102
3103 /*
3104  * logging directories is very similar to logging inodes, We find all the items
3105  * from the current transaction and write them to the log.
3106  *
3107  * The recovery code scans the directory in the subvolume, and if it finds a
3108  * key in the range logged that is not present in the log tree, then it means
3109  * that dir entry was unlinked during the transaction.
3110  *
3111  * In order for that scan to work, we must include one key smaller than
3112  * the smallest logged by this transaction and one key larger than the largest
3113  * key logged by this transaction.
3114  */
3115 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3116                           struct btrfs_root *root, struct inode *inode,
3117                           struct btrfs_path *path,
3118                           struct btrfs_path *dst_path)
3119 {
3120         u64 min_key;
3121         u64 max_key;
3122         int ret;
3123         int key_type = BTRFS_DIR_ITEM_KEY;
3124
3125 again:
3126         min_key = 0;
3127         max_key = 0;
3128         while (1) {
3129                 ret = log_dir_items(trans, root, inode, path,
3130                                     dst_path, key_type, min_key,
3131                                     &max_key);
3132                 if (ret)
3133                         return ret;
3134                 if (max_key == (u64)-1)
3135                         break;
3136                 min_key = max_key + 1;
3137         }
3138
3139         if (key_type == BTRFS_DIR_ITEM_KEY) {
3140                 key_type = BTRFS_DIR_INDEX_KEY;
3141                 goto again;
3142         }
3143         return 0;
3144 }
3145
3146 /*
3147  * a helper function to drop items from the log before we relog an
3148  * inode.  max_key_type indicates the highest item type to remove.
3149  * This cannot be run for file data extents because it does not
3150  * free the extents they point to.
3151  */
3152 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3153                                   struct btrfs_root *log,
3154                                   struct btrfs_path *path,
3155                                   u64 objectid, int max_key_type)
3156 {
3157         int ret;
3158         struct btrfs_key key;
3159         struct btrfs_key found_key;
3160         int start_slot;
3161
3162         key.objectid = objectid;
3163         key.type = max_key_type;
3164         key.offset = (u64)-1;
3165
3166         while (1) {
3167                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3168                 BUG_ON(ret == 0); /* Logic error */
3169                 if (ret < 0)
3170                         break;
3171
3172                 if (path->slots[0] == 0)
3173                         break;
3174
3175                 path->slots[0]--;
3176                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3177                                       path->slots[0]);
3178
3179                 if (found_key.objectid != objectid)
3180                         break;
3181
3182                 found_key.offset = 0;
3183                 found_key.type = 0;
3184                 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3185                                        &start_slot);
3186
3187                 ret = btrfs_del_items(trans, log, path, start_slot,
3188                                       path->slots[0] - start_slot + 1);
3189                 /*
3190                  * If start slot isn't 0 then we don't need to re-search, we've
3191                  * found the last guy with the objectid in this tree.
3192                  */
3193                 if (ret || start_slot != 0)
3194                         break;
3195                 btrfs_release_path(path);
3196         }
3197         btrfs_release_path(path);
3198         if (ret > 0)
3199                 ret = 0;
3200         return ret;
3201 }
3202
3203 static void fill_inode_item(struct btrfs_trans_handle *trans,
3204                             struct extent_buffer *leaf,
3205                             struct btrfs_inode_item *item,
3206                             struct inode *inode, int log_inode_only)
3207 {
3208         struct btrfs_map_token token;
3209
3210         btrfs_init_map_token(&token);
3211
3212         if (log_inode_only) {
3213                 /* set the generation to zero so the recover code
3214                  * can tell the difference between an logging
3215                  * just to say 'this inode exists' and a logging
3216                  * to say 'update this inode with these values'
3217                  */
3218                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3219                 btrfs_set_token_inode_size(leaf, item, 0, &token);
3220         } else {
3221                 btrfs_set_token_inode_generation(leaf, item,
3222                                                  BTRFS_I(inode)->generation,
3223                                                  &token);
3224                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3225         }
3226
3227         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3228         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3229         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3230         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3231
3232         btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3233                                      inode->i_atime.tv_sec, &token);
3234         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3235                                       inode->i_atime.tv_nsec, &token);
3236
3237         btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3238                                      inode->i_mtime.tv_sec, &token);
3239         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3240                                       inode->i_mtime.tv_nsec, &token);
3241
3242         btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3243                                      inode->i_ctime.tv_sec, &token);
3244         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3245                                       inode->i_ctime.tv_nsec, &token);
3246
3247         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3248                                      &token);
3249
3250         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3251         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3252         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3253         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3254         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3255 }
3256
3257 static int log_inode_item(struct btrfs_trans_handle *trans,
3258                           struct btrfs_root *log, struct btrfs_path *path,
3259                           struct inode *inode)
3260 {
3261         struct btrfs_inode_item *inode_item;
3262         int ret;
3263
3264         ret = btrfs_insert_empty_item(trans, log, path,
3265                                       &BTRFS_I(inode)->location,
3266                                       sizeof(*inode_item));
3267         if (ret && ret != -EEXIST)
3268                 return ret;
3269         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3270                                     struct btrfs_inode_item);
3271         fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3272         btrfs_release_path(path);
3273         return 0;
3274 }
3275
3276 static noinline int copy_items(struct btrfs_trans_handle *trans,
3277                                struct inode *inode,
3278                                struct btrfs_path *dst_path,
3279                                struct btrfs_path *src_path, u64 *last_extent,
3280                                int start_slot, int nr, int inode_only)
3281 {
3282         unsigned long src_offset;
3283         unsigned long dst_offset;
3284         struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3285         struct btrfs_file_extent_item *extent;
3286         struct btrfs_inode_item *inode_item;
3287         struct extent_buffer *src = src_path->nodes[0];
3288         struct btrfs_key first_key, last_key, key;
3289         int ret;
3290         struct btrfs_key *ins_keys;
3291         u32 *ins_sizes;
3292         char *ins_data;
3293         int i;
3294         struct list_head ordered_sums;
3295         int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3296         bool has_extents = false;
3297         bool need_find_last_extent = (*last_extent == 0);
3298         bool done = false;
3299
3300         INIT_LIST_HEAD(&ordered_sums);
3301
3302         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3303                            nr * sizeof(u32), GFP_NOFS);
3304         if (!ins_data)
3305                 return -ENOMEM;
3306
3307         first_key.objectid = (u64)-1;
3308
3309         ins_sizes = (u32 *)ins_data;
3310         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3311
3312         for (i = 0; i < nr; i++) {
3313                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3314                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3315         }
3316         ret = btrfs_insert_empty_items(trans, log, dst_path,
3317                                        ins_keys, ins_sizes, nr);
3318         if (ret) {
3319                 kfree(ins_data);
3320                 return ret;
3321         }
3322
3323         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3324                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3325                                                    dst_path->slots[0]);
3326
3327                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3328
3329                 if ((i == (nr - 1)))
3330                         last_key = ins_keys[i];
3331
3332                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3333                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
3334                                                     dst_path->slots[0],
3335                                                     struct btrfs_inode_item);
3336                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
3337                                         inode, inode_only == LOG_INODE_EXISTS);
3338                 } else {
3339                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3340                                            src_offset, ins_sizes[i]);
3341                 }
3342
3343                 /*
3344                  * We set need_find_last_extent here in case we know we were
3345                  * processing other items and then walk into the first extent in
3346                  * the inode.  If we don't hit an extent then nothing changes,
3347                  * we'll do the last search the next time around.
3348                  */
3349                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3350                         has_extents = true;
3351                         if (need_find_last_extent &&
3352                             first_key.objectid == (u64)-1)
3353                                 first_key = ins_keys[i];
3354                 } else {
3355                         need_find_last_extent = false;
3356                 }
3357
3358                 /* take a reference on file data extents so that truncates
3359                  * or deletes of this inode don't have to relog the inode
3360                  * again
3361                  */
3362                 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3363                     !skip_csum) {
3364                         int found_type;
3365                         extent = btrfs_item_ptr(src, start_slot + i,
3366                                                 struct btrfs_file_extent_item);
3367
3368                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
3369                                 continue;
3370
3371                         found_type = btrfs_file_extent_type(src, extent);
3372                         if (found_type == BTRFS_FILE_EXTENT_REG) {
3373                                 u64 ds, dl, cs, cl;
3374                                 ds = btrfs_file_extent_disk_bytenr(src,
3375                                                                 extent);
3376                                 /* ds == 0 is a hole */
3377                                 if (ds == 0)
3378                                         continue;
3379
3380                                 dl = btrfs_file_extent_disk_num_bytes(src,
3381                                                                 extent);
3382                                 cs = btrfs_file_extent_offset(src, extent);
3383                                 cl = btrfs_file_extent_num_bytes(src,
3384                                                                 extent);
3385                                 if (btrfs_file_extent_compression(src,
3386                                                                   extent)) {
3387                                         cs = 0;
3388                                         cl = dl;
3389                                 }
3390
3391                                 ret = btrfs_lookup_csums_range(
3392                                                 log->fs_info->csum_root,
3393                                                 ds + cs, ds + cs + cl - 1,
3394                                                 &ordered_sums, 0);
3395                                 if (ret) {
3396                                         btrfs_release_path(dst_path);
3397                                         kfree(ins_data);
3398                                         return ret;
3399                                 }
3400                         }
3401                 }
3402         }
3403
3404         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3405         btrfs_release_path(dst_path);
3406         kfree(ins_data);
3407
3408         /*
3409          * we have to do this after the loop above to avoid changing the
3410          * log tree while trying to change the log tree.
3411          */
3412         ret = 0;
3413         while (!list_empty(&ordered_sums)) {
3414                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3415                                                    struct btrfs_ordered_sum,
3416                                                    list);
3417                 if (!ret)
3418                         ret = btrfs_csum_file_blocks(trans, log, sums);
3419                 list_del(&sums->list);
3420                 kfree(sums);
3421         }
3422
3423         if (!has_extents)
3424                 return ret;
3425
3426         /*
3427          * Because we use btrfs_search_forward we could skip leaves that were
3428          * not modified and then assume *last_extent is valid when it really
3429          * isn't.  So back up to the previous leaf and read the end of the last
3430          * extent before we go and fill in holes.
3431          */
3432         if (need_find_last_extent) {
3433                 u64 len;
3434
3435                 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
3436                 if (ret < 0)
3437                         return ret;
3438                 if (ret)
3439                         goto fill_holes;
3440                 if (src_path->slots[0])
3441                         src_path->slots[0]--;
3442                 src = src_path->nodes[0];
3443                 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3444                 if (key.objectid != btrfs_ino(inode) ||
3445                     key.type != BTRFS_EXTENT_DATA_KEY)
3446                         goto fill_holes;
3447                 extent = btrfs_item_ptr(src, src_path->slots[0],
3448                                         struct btrfs_file_extent_item);
3449                 if (btrfs_file_extent_type(src, extent) ==
3450                     BTRFS_FILE_EXTENT_INLINE) {
3451                         len = btrfs_file_extent_inline_len(src,
3452                                                            src_path->slots[0],
3453                                                            extent);
3454                         *last_extent = ALIGN(key.offset + len,
3455                                              log->sectorsize);
3456                 } else {
3457                         len = btrfs_file_extent_num_bytes(src, extent);
3458                         *last_extent = key.offset + len;
3459                 }
3460         }
3461 fill_holes:
3462         /* So we did prev_leaf, now we need to move to the next leaf, but a few
3463          * things could have happened
3464          *
3465          * 1) A merge could have happened, so we could currently be on a leaf
3466          * that holds what we were copying in the first place.
3467          * 2) A split could have happened, and now not all of the items we want
3468          * are on the same leaf.
3469          *
3470          * So we need to adjust how we search for holes, we need to drop the
3471          * path and re-search for the first extent key we found, and then walk
3472          * forward until we hit the last one we copied.
3473          */
3474         if (need_find_last_extent) {
3475                 /* btrfs_prev_leaf could return 1 without releasing the path */
3476                 btrfs_release_path(src_path);
3477                 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
3478                                         src_path, 0, 0);
3479                 if (ret < 0)
3480                         return ret;
3481                 ASSERT(ret == 0);
3482                 src = src_path->nodes[0];
3483                 i = src_path->slots[0];
3484         } else {
3485                 i = start_slot;
3486         }
3487
3488         /*
3489          * Ok so here we need to go through and fill in any holes we may have
3490          * to make sure that holes are punched for those areas in case they had
3491          * extents previously.
3492          */
3493         while (!done) {
3494                 u64 offset, len;
3495                 u64 extent_end;
3496
3497                 if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3498                         ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
3499                         if (ret < 0)
3500                                 return ret;
3501                         ASSERT(ret == 0);
3502                         src = src_path->nodes[0];
3503                         i = 0;
3504                 }
3505
3506                 btrfs_item_key_to_cpu(src, &key, i);
3507                 if (!btrfs_comp_cpu_keys(&key, &last_key))
3508                         done = true;
3509                 if (key.objectid != btrfs_ino(inode) ||
3510                     key.type != BTRFS_EXTENT_DATA_KEY) {
3511                         i++;
3512                         continue;
3513                 }
3514                 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3515                 if (btrfs_file_extent_type(src, extent) ==
3516                     BTRFS_FILE_EXTENT_INLINE) {
3517                         len = btrfs_file_extent_inline_len(src, i, extent);
3518                         extent_end = ALIGN(key.offset + len, log->sectorsize);
3519                 } else {
3520                         len = btrfs_file_extent_num_bytes(src, extent);
3521                         extent_end = key.offset + len;
3522                 }
3523                 i++;
3524
3525                 if (*last_extent == key.offset) {
3526                         *last_extent = extent_end;
3527                         continue;
3528                 }
3529                 offset = *last_extent;
3530                 len = key.offset - *last_extent;
3531                 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3532                                                offset, 0, 0, len, 0, len, 0,
3533                                                0, 0);
3534                 if (ret)
3535                         break;
3536                 *last_extent = offset + len;
3537         }
3538         /*
3539          * Need to let the callers know we dropped the path so they should
3540          * re-search.
3541          */
3542         if (!ret && need_find_last_extent)
3543                 ret = 1;
3544         return ret;
3545 }
3546
3547 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3548 {
3549         struct extent_map *em1, *em2;
3550
3551         em1 = list_entry(a, struct extent_map, list);
3552         em2 = list_entry(b, struct extent_map, list);
3553
3554         if (em1->start < em2->start)
3555                 return -1;
3556         else if (em1->start > em2->start)
3557                 return 1;
3558         return 0;
3559 }
3560
3561 static int log_one_extent(struct btrfs_trans_handle *trans,
3562                           struct inode *inode, struct btrfs_root *root,
3563                           struct extent_map *em, struct btrfs_path *path,
3564                           struct list_head *logged_list)
3565 {
3566         struct btrfs_root *log = root->log_root;
3567         struct btrfs_file_extent_item *fi;
3568         struct extent_buffer *leaf;
3569         struct btrfs_ordered_extent *ordered;
3570         struct list_head ordered_sums;
3571         struct btrfs_map_token token;
3572         struct btrfs_key key;
3573         u64 mod_start = em->mod_start;
3574         u64 mod_len = em->mod_len;
3575         u64 csum_offset;
3576         u64 csum_len;
3577         u64 extent_offset = em->start - em->orig_start;
3578         u64 block_len;
3579         int ret;
3580         bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3581         int extent_inserted = 0;
3582
3583         INIT_LIST_HEAD(&ordered_sums);
3584         btrfs_init_map_token(&token);
3585
3586         ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3587                                    em->start + em->len, NULL, 0, 1,
3588                                    sizeof(*fi), &extent_inserted);
3589         if (ret)
3590                 return ret;
3591
3592         if (!extent_inserted) {
3593                 key.objectid = btrfs_ino(inode);
3594                 key.type = BTRFS_EXTENT_DATA_KEY;
3595                 key.offset = em->start;
3596
3597                 ret = btrfs_insert_empty_item(trans, log, path, &key,
3598                                               sizeof(*fi));
3599                 if (ret)
3600                         return ret;
3601         }
3602         leaf = path->nodes[0];
3603         fi = btrfs_item_ptr(leaf, path->slots[0],
3604                             struct btrfs_file_extent_item);
3605
3606         btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3607                                                &token);
3608         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3609                 skip_csum = true;
3610                 btrfs_set_token_file_extent_type(leaf, fi,
3611                                                  BTRFS_FILE_EXTENT_PREALLOC,
3612                                                  &token);
3613         } else {
3614                 btrfs_set_token_file_extent_type(leaf, fi,
3615                                                  BTRFS_FILE_EXTENT_REG,
3616                                                  &token);
3617                 if (em->block_start == EXTENT_MAP_HOLE)
3618                         skip_csum = true;
3619         }
3620
3621         block_len = max(em->block_len, em->orig_block_len);
3622         if (em->compress_type != BTRFS_COMPRESS_NONE) {
3623                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3624                                                         em->block_start,
3625                                                         &token);
3626                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3627                                                            &token);
3628         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3629                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3630                                                         em->block_start -
3631                                                         extent_offset, &token);
3632                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3633                                                            &token);
3634         } else {
3635                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3636                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3637                                                            &token);
3638         }
3639
3640         btrfs_set_token_file_extent_offset(leaf, fi,
3641                                            em->start - em->orig_start,
3642                                            &token);
3643         btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3644         btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3645         btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3646                                                 &token);
3647         btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3648         btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3649         btrfs_mark_buffer_dirty(leaf);
3650
3651         btrfs_release_path(path);
3652         if (ret) {
3653                 return ret;
3654         }
3655
3656         if (skip_csum)
3657                 return 0;
3658
3659         /*
3660          * First check and see if our csums are on our outstanding ordered
3661          * extents.
3662          */
3663         list_for_each_entry(ordered, logged_list, log_list) {
3664                 struct btrfs_ordered_sum *sum;
3665
3666                 if (!mod_len)
3667                         break;
3668
3669                 if (ordered->file_offset + ordered->len <= mod_start ||
3670                     mod_start + mod_len <= ordered->file_offset)
3671                         continue;
3672
3673                 /*
3674                  * We are going to copy all the csums on this ordered extent, so
3675                  * go ahead and adjust mod_start and mod_len in case this
3676                  * ordered extent has already been logged.
3677                  */
3678                 if (ordered->file_offset > mod_start) {
3679                         if (ordered->file_offset + ordered->len >=
3680                             mod_start + mod_len)
3681                                 mod_len = ordered->file_offset - mod_start;
3682                         /*
3683                          * If we have this case
3684                          *
3685                          * |--------- logged extent ---------|
3686                          *       |----- ordered extent ----|
3687                          *
3688                          * Just don't mess with mod_start and mod_len, we'll
3689                          * just end up logging more csums than we need and it
3690                          * will be ok.
3691                          */
3692                 } else {
3693                         if (ordered->file_offset + ordered->len <
3694                             mod_start + mod_len) {
3695                                 mod_len = (mod_start + mod_len) -
3696                                         (ordered->file_offset + ordered->len);
3697                                 mod_start = ordered->file_offset +
3698                                         ordered->len;
3699                         } else {
3700                                 mod_len = 0;
3701                         }
3702                 }
3703
3704                 /*
3705                  * To keep us from looping for the above case of an ordered
3706                  * extent that falls inside of the logged extent.
3707                  */
3708                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3709                                      &ordered->flags))
3710                         continue;
3711
3712                 if (ordered->csum_bytes_left) {
3713                         btrfs_start_ordered_extent(inode, ordered, 0);
3714                         wait_event(ordered->wait,
3715                                    ordered->csum_bytes_left == 0);
3716                 }
3717
3718                 list_for_each_entry(sum, &ordered->list, list) {
3719                         ret = btrfs_csum_file_blocks(trans, log, sum);
3720                         if (ret)
3721                                 goto unlocked;
3722                 }
3723
3724         }
3725 unlocked:
3726
3727         if (!mod_len || ret)
3728                 return ret;
3729
3730         if (em->compress_type) {
3731                 csum_offset = 0;
3732                 csum_len = block_len;
3733         } else {
3734                 csum_offset = mod_start - em->start;
3735                 csum_len = mod_len;
3736         }
3737
3738         /* block start is already adjusted for the file extent offset. */
3739         ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3740                                        em->block_start + csum_offset,
3741                                        em->block_start + csum_offset +
3742                                        csum_len - 1, &ordered_sums, 0);
3743         if (ret)
3744                 return ret;
3745
3746         while (!list_empty(&ordered_sums)) {
3747                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3748                                                    struct btrfs_ordered_sum,
3749                                                    list);
3750                 if (!ret)
3751                         ret = btrfs_csum_file_blocks(trans, log, sums);
3752                 list_del(&sums->list);
3753                 kfree(sums);
3754         }
3755
3756         return ret;
3757 }
3758
3759 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3760                                      struct btrfs_root *root,
3761                                      struct inode *inode,
3762                                      struct btrfs_path *path,
3763                                      struct list_head *logged_list)
3764 {
3765         struct extent_map *em, *n;
3766         struct list_head extents;
3767         struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3768         u64 test_gen;
3769         int ret = 0;
3770         int num = 0;
3771
3772         INIT_LIST_HEAD(&extents);
3773
3774         write_lock(&tree->lock);
3775         test_gen = root->fs_info->last_trans_committed;
3776
3777         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3778                 list_del_init(&em->list);
3779
3780                 /*
3781                  * Just an arbitrary number, this can be really CPU intensive
3782                  * once we start getting a lot of extents, and really once we
3783                  * have a bunch of extents we just want to commit since it will
3784                  * be faster.
3785                  */
3786                 if (++num > 32768) {
3787                         list_del_init(&tree->modified_extents);
3788                         ret = -EFBIG;
3789                         goto process;
3790                 }
3791
3792                 if (em->generation <= test_gen)
3793                         continue;
3794                 /* Need a ref to keep it from getting evicted from cache */
3795                 atomic_inc(&em->refs);
3796                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3797                 list_add_tail(&em->list, &extents);
3798                 num++;
3799         }
3800
3801         list_sort(NULL, &extents, extent_cmp);
3802
3803 process:
3804         while (!list_empty(&extents)) {
3805                 em = list_entry(extents.next, struct extent_map, list);
3806
3807                 list_del_init(&em->list);
3808
3809                 /*
3810                  * If we had an error we just need to delete everybody from our
3811                  * private list.
3812                  */
3813                 if (ret) {
3814                         clear_em_logging(tree, em);
3815                         free_extent_map(em);
3816                         continue;
3817                 }
3818
3819                 write_unlock(&tree->lock);
3820
3821                 ret = log_one_extent(trans, inode, root, em, path, logged_list);
3822                 write_lock(&tree->lock);
3823                 clear_em_logging(tree, em);
3824                 free_extent_map(em);
3825         }
3826         WARN_ON(!list_empty(&extents));
3827         write_unlock(&tree->lock);
3828
3829         btrfs_release_path(path);
3830         return ret;
3831 }
3832
3833 /* log a single inode in the tree log.
3834  * At least one parent directory for this inode must exist in the tree
3835  * or be logged already.
3836  *
3837  * Any items from this inode changed by the current transaction are copied
3838  * to the log tree.  An extra reference is taken on any extents in this
3839  * file, allowing us to avoid a whole pile of corner cases around logging
3840  * blocks that have been removed from the tree.
3841  *
3842  * See LOG_INODE_ALL and related defines for a description of what inode_only
3843  * does.
3844  *
3845  * This handles both files and directories.
3846  */
3847 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3848                              struct btrfs_root *root, struct inode *inode,
3849                              int inode_only)
3850 {
3851         struct btrfs_path *path;
3852         struct btrfs_path *dst_path;
3853         struct btrfs_key min_key;
3854         struct btrfs_key max_key;
3855         struct btrfs_root *log = root->log_root;
3856         struct extent_buffer *src = NULL;
3857         LIST_HEAD(logged_list);
3858         u64 last_extent = 0;
3859         int err = 0;
3860         int ret;
3861         int nritems;
3862         int ins_start_slot = 0;
3863         int ins_nr;
3864         bool fast_search = false;
3865         u64 ino = btrfs_ino(inode);
3866
3867         path = btrfs_alloc_path();
3868         if (!path)
3869                 return -ENOMEM;
3870         dst_path = btrfs_alloc_path();
3871         if (!dst_path) {
3872                 btrfs_free_path(path);
3873                 return -ENOMEM;
3874         }
3875
3876         min_key.objectid = ino;
3877         min_key.type = BTRFS_INODE_ITEM_KEY;
3878         min_key.offset = 0;
3879
3880         max_key.objectid = ino;
3881
3882
3883         /* today the code can only do partial logging of directories */
3884         if (S_ISDIR(inode->i_mode) ||
3885             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3886                        &BTRFS_I(inode)->runtime_flags) &&
3887              inode_only == LOG_INODE_EXISTS))
3888                 max_key.type = BTRFS_XATTR_ITEM_KEY;
3889         else
3890                 max_key.type = (u8)-1;
3891         max_key.offset = (u64)-1;
3892
3893         /* Only run delayed items if we are a dir or a new file */
3894         if (S_ISDIR(inode->i_mode) ||
3895             BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
3896                 ret = btrfs_commit_inode_delayed_items(trans, inode);
3897                 if (ret) {
3898                         btrfs_free_path(path);
3899                         btrfs_free_path(dst_path);
3900                         return ret;
3901                 }
3902         }
3903
3904         mutex_lock(&BTRFS_I(inode)->log_mutex);
3905
3906         btrfs_get_logged_extents(inode, &logged_list);
3907
3908         /*
3909          * a brute force approach to making sure we get the most uptodate
3910          * copies of everything.
3911          */
3912         if (S_ISDIR(inode->i_mode)) {
3913                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3914
3915                 if (inode_only == LOG_INODE_EXISTS)
3916                         max_key_type = BTRFS_XATTR_ITEM_KEY;
3917                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3918         } else {
3919                 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3920                                        &BTRFS_I(inode)->runtime_flags)) {
3921                         clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3922                                   &BTRFS_I(inode)->runtime_flags);
3923                         ret = btrfs_truncate_inode_items(trans, log,
3924                                                          inode, 0, 0);
3925                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3926                                               &BTRFS_I(inode)->runtime_flags) ||
3927                            inode_only == LOG_INODE_EXISTS) {
3928                         if (inode_only == LOG_INODE_ALL)
3929                                 fast_search = true;
3930                         max_key.type = BTRFS_XATTR_ITEM_KEY;
3931                         ret = drop_objectid_items(trans, log, path, ino,
3932                                                   max_key.type);
3933                 } else {
3934                         if (inode_only == LOG_INODE_ALL)
3935                                 fast_search = true;
3936                         ret = log_inode_item(trans, log, dst_path, inode);
3937                         if (ret) {
3938                                 err = ret;
3939                                 goto out_unlock;
3940                         }
3941                         goto log_extents;
3942                 }
3943
3944         }
3945         if (ret) {
3946                 err = ret;
3947                 goto out_unlock;
3948         }
3949         path->keep_locks = 1;
3950
3951         while (1) {
3952                 ins_nr = 0;
3953                 ret = btrfs_search_forward(root, &min_key,
3954                                            path, trans->transid);
3955                 if (ret != 0)
3956                         break;
3957 again:
3958                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
3959                 if (min_key.objectid != ino)
3960                         break;
3961                 if (min_key.type > max_key.type)
3962                         break;
3963
3964                 src = path->nodes[0];
3965                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
3966                         ins_nr++;
3967                         goto next_slot;
3968                 } else if (!ins_nr) {
3969                         ins_start_slot = path->slots[0];
3970                         ins_nr = 1;
3971                         goto next_slot;
3972                 }
3973
3974                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
3975                                  ins_start_slot, ins_nr, inode_only);
3976                 if (ret < 0) {
3977                         err = ret;
3978                         goto out_unlock;
3979                 } if (ret) {
3980                         ins_nr = 0;
3981                         btrfs_release_path(path);
3982                         continue;
3983                 }
3984                 ins_nr = 1;
3985                 ins_start_slot = path->slots[0];
3986 next_slot:
3987
3988                 nritems = btrfs_header_nritems(path->nodes[0]);
3989                 path->slots[0]++;
3990                 if (path->slots[0] < nritems) {
3991                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
3992                                               path->slots[0]);
3993                         goto again;
3994                 }
3995                 if (ins_nr) {
3996                         ret = copy_items(trans, inode, dst_path, path,
3997                                          &last_extent, ins_start_slot,
3998                                          ins_nr, inode_only);
3999                         if (ret < 0) {
4000                                 err = ret;
4001                                 goto out_unlock;
4002                         }
4003                         ret = 0;
4004                         ins_nr = 0;
4005                 }
4006                 btrfs_release_path(path);
4007
4008                 if (min_key.offset < (u64)-1) {
4009                         min_key.offset++;
4010                 } else if (min_key.type < max_key.type) {
4011                         min_key.type++;
4012                         min_key.offset = 0;
4013                 } else {
4014                         break;
4015                 }
4016         }
4017         if (ins_nr) {
4018                 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4019                                  ins_start_slot, ins_nr, inode_only);
4020                 if (ret < 0) {
4021                         err = ret;
4022                         goto out_unlock;
4023                 }
4024                 ret = 0;
4025                 ins_nr = 0;
4026         }
4027
4028 log_extents:
4029         btrfs_release_path(path);
4030         btrfs_release_path(dst_path);
4031         if (fast_search) {
4032                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4033                                                 &logged_list);
4034                 if (ret) {
4035                         err = ret;
4036                         goto out_unlock;
4037                 }
4038         } else if (inode_only == LOG_INODE_ALL) {
4039                 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
4040                 struct extent_map *em, *n;
4041
4042                 write_lock(&tree->lock);
4043                 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
4044                         list_del_init(&em->list);
4045                 write_unlock(&tree->lock);
4046         }
4047
4048         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4049                 ret = log_directory_changes(trans, root, inode, path, dst_path);
4050                 if (ret) {
4051                         err = ret;
4052                         goto out_unlock;
4053                 }
4054         }
4055         BTRFS_I(inode)->logged_trans = trans->transid;
4056         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4057 out_unlock:
4058         if (unlikely(err))
4059                 btrfs_put_logged_extents(&logged_list);
4060         else
4061                 btrfs_submit_logged_extents(&logged_list, log);
4062         mutex_unlock(&BTRFS_I(inode)->log_mutex);
4063
4064         btrfs_free_path(path);
4065         btrfs_free_path(dst_path);
4066         return err;
4067 }
4068
4069 /*
4070  * follow the dentry parent pointers up the chain and see if any
4071  * of the directories in it require a full commit before they can
4072  * be logged.  Returns zero if nothing special needs to be done or 1 if
4073  * a full commit is required.
4074  */
4075 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4076                                                struct inode *inode,
4077                                                struct dentry *parent,
4078                                                struct super_block *sb,
4079                                                u64 last_committed)
4080 {
4081         int ret = 0;
4082         struct btrfs_root *root;
4083         struct dentry *old_parent = NULL;
4084         struct inode *orig_inode = inode;
4085
4086         /*
4087          * for regular files, if its inode is already on disk, we don't
4088          * have to worry about the parents at all.  This is because
4089          * we can use the last_unlink_trans field to record renames
4090          * and other fun in this file.
4091          */
4092         if (S_ISREG(inode->i_mode) &&
4093             BTRFS_I(inode)->generation <= last_committed &&
4094             BTRFS_I(inode)->last_unlink_trans <= last_committed)
4095                         goto out;
4096
4097         if (!S_ISDIR(inode->i_mode)) {
4098                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4099                         goto out;
4100                 inode = parent->d_inode;
4101         }
4102
4103         while (1) {
4104                 /*
4105                  * If we are logging a directory then we start with our inode,
4106                  * not our parents inode, so we need to skipp setting the
4107                  * logged_trans so that further down in the log code we don't
4108                  * think this inode has already been logged.
4109                  */
4110                 if (inode != orig_inode)
4111                         BTRFS_I(inode)->logged_trans = trans->transid;
4112                 smp_mb();
4113
4114                 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
4115                         root = BTRFS_I(inode)->root;
4116
4117                         /*
4118                          * make sure any commits to the log are forced
4119                          * to be full commits
4120                          */
4121                         root->fs_info->last_trans_log_full_commit =
4122                                 trans->transid;
4123                         ret = 1;
4124                         break;
4125                 }
4126
4127                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4128                         break;
4129
4130                 if (IS_ROOT(parent))
4131                         break;
4132
4133                 parent = dget_parent(parent);
4134                 dput(old_parent);
4135                 old_parent = parent;
4136                 inode = parent->d_inode;
4137
4138         }
4139         dput(old_parent);
4140 out:
4141         return ret;
4142 }
4143
4144 /*
4145  * helper function around btrfs_log_inode to make sure newly created
4146  * parent directories also end up in the log.  A minimal inode and backref
4147  * only logging is done of any parent directories that are older than
4148  * the last committed transaction
4149  */
4150 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4151                                   struct btrfs_root *root, struct inode *inode,
4152                                   struct dentry *parent, int exists_only,
4153                                   struct btrfs_log_ctx *ctx)
4154 {
4155         int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4156         struct super_block *sb;
4157         struct dentry *old_parent = NULL;
4158         int ret = 0;
4159         u64 last_committed = root->fs_info->last_trans_committed;
4160
4161         sb = inode->i_sb;
4162
4163         if (btrfs_test_opt(root, NOTREELOG)) {
4164                 ret = 1;
4165                 goto end_no_trans;
4166         }
4167
4168         if (root->fs_info->last_trans_log_full_commit >
4169             root->fs_info->last_trans_committed) {
4170                 ret = 1;
4171                 goto end_no_trans;
4172         }
4173
4174         if (root != BTRFS_I(inode)->root ||
4175             btrfs_root_refs(&root->root_item) == 0) {
4176                 ret = 1;
4177                 goto end_no_trans;
4178         }
4179
4180         ret = check_parent_dirs_for_sync(trans, inode, parent,
4181                                          sb, last_committed);
4182         if (ret)
4183                 goto end_no_trans;
4184
4185         if (btrfs_inode_in_log(inode, trans->transid)) {
4186                 ret = BTRFS_NO_LOG_SYNC;
4187                 goto end_no_trans;
4188         }
4189
4190         ret = start_log_trans(trans, root, ctx);
4191         if (ret)
4192                 goto end_no_trans;
4193
4194         ret = btrfs_log_inode(trans, root, inode, inode_only);
4195         if (ret)
4196                 goto end_trans;
4197
4198         /*
4199          * for regular files, if its inode is already on disk, we don't
4200          * have to worry about the parents at all.  This is because
4201          * we can use the last_unlink_trans field to record renames
4202          * and other fun in this file.
4203          */
4204         if (S_ISREG(inode->i_mode) &&
4205             BTRFS_I(inode)->generation <= last_committed &&
4206             BTRFS_I(inode)->last_unlink_trans <= last_committed) {
4207                 ret = 0;
4208                 goto end_trans;
4209         }
4210
4211         inode_only = LOG_INODE_EXISTS;
4212         while (1) {
4213                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4214                         break;
4215
4216                 inode = parent->d_inode;
4217                 if (root != BTRFS_I(inode)->root)
4218                         break;
4219
4220                 if (BTRFS_I(inode)->generation >
4221                     root->fs_info->last_trans_committed) {
4222                         ret = btrfs_log_inode(trans, root, inode, inode_only);
4223                         if (ret)
4224                                 goto end_trans;
4225                 }
4226                 if (IS_ROOT(parent))
4227                         break;
4228
4229                 parent = dget_parent(parent);
4230                 dput(old_parent);
4231                 old_parent = parent;
4232         }
4233         ret = 0;
4234 end_trans:
4235         dput(old_parent);
4236         if (ret < 0) {
4237                 root->fs_info->last_trans_log_full_commit = trans->transid;
4238                 ret = 1;
4239         }
4240
4241         if (ret)
4242                 btrfs_remove_log_ctx(root, ctx);
4243         btrfs_end_log_trans(root);
4244 end_no_trans:
4245         return ret;
4246 }
4247
4248 /*
4249  * it is not safe to log dentry if the chunk root has added new
4250  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
4251  * If this returns 1, you must commit the transaction to safely get your
4252  * data on disk.
4253  */
4254 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4255                           struct btrfs_root *root, struct dentry *dentry,
4256                           struct btrfs_log_ctx *ctx)
4257 {
4258         struct dentry *parent = dget_parent(dentry);
4259         int ret;
4260
4261         ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4262                                      0, ctx);
4263         dput(parent);
4264
4265         return ret;
4266 }
4267
4268 /*
4269  * should be called during mount to recover any replay any log trees
4270  * from the FS
4271  */
4272 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
4273 {
4274         int ret;
4275         struct btrfs_path *path;
4276         struct btrfs_trans_handle *trans;
4277         struct btrfs_key key;
4278         struct btrfs_key found_key;
4279         struct btrfs_key tmp_key;
4280         struct btrfs_root *log;
4281         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
4282         struct walk_control wc = {
4283                 .process_func = process_one_buffer,
4284                 .stage = 0,
4285         };
4286
4287         path = btrfs_alloc_path();
4288         if (!path)
4289                 return -ENOMEM;
4290
4291         fs_info->log_root_recovering = 1;
4292
4293         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4294         if (IS_ERR(trans)) {
4295                 ret = PTR_ERR(trans);
4296                 goto error;
4297         }
4298
4299         wc.trans = trans;
4300         wc.pin = 1;
4301
4302         ret = walk_log_tree(trans, log_root_tree, &wc);
4303         if (ret) {
4304                 btrfs_error(fs_info, ret, "Failed to pin buffers while "
4305                             "recovering log root tree.");
4306                 goto error;
4307         }
4308
4309 again:
4310         key.objectid = BTRFS_TREE_LOG_OBJECTID;
4311         key.offset = (u64)-1;
4312         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
4313
4314         while (1) {
4315                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
4316
4317                 if (ret < 0) {
4318                         btrfs_error(fs_info, ret,
4319                                     "Couldn't find tree log root.");
4320                         goto error;
4321                 }
4322                 if (ret > 0) {
4323                         if (path->slots[0] == 0)
4324                                 break;
4325                         path->slots[0]--;
4326                 }
4327                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4328                                       path->slots[0]);
4329                 btrfs_release_path(path);
4330                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4331                         break;
4332
4333                 log = btrfs_read_fs_root(log_root_tree, &found_key);
4334                 if (IS_ERR(log)) {
4335                         ret = PTR_ERR(log);
4336                         btrfs_error(fs_info, ret,
4337                                     "Couldn't read tree log root.");
4338                         goto error;
4339                 }
4340
4341                 tmp_key.objectid = found_key.offset;
4342                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
4343                 tmp_key.offset = (u64)-1;
4344
4345                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
4346                 if (IS_ERR(wc.replay_dest)) {
4347                         ret = PTR_ERR(wc.replay_dest);
4348                         free_extent_buffer(log->node);
4349                         free_extent_buffer(log->commit_root);
4350                         kfree(log);
4351                         btrfs_error(fs_info, ret, "Couldn't read target root "
4352                                     "for tree log recovery.");
4353                         goto error;
4354                 }
4355
4356                 wc.replay_dest->log_root = log;
4357                 btrfs_record_root_in_trans(trans, wc.replay_dest);
4358                 ret = walk_log_tree(trans, log, &wc);
4359
4360                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
4361                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
4362                                                       path);
4363                 }
4364
4365                 key.offset = found_key.offset - 1;
4366                 wc.replay_dest->log_root = NULL;
4367                 free_extent_buffer(log->node);
4368                 free_extent_buffer(log->commit_root);
4369                 kfree(log);
4370
4371                 if (ret)
4372                         goto error;
4373
4374                 if (found_key.offset == 0)
4375                         break;
4376         }
4377         btrfs_release_path(path);
4378
4379         /* step one is to pin it all, step two is to replay just inodes */
4380         if (wc.pin) {
4381                 wc.pin = 0;
4382                 wc.process_func = replay_one_buffer;
4383                 wc.stage = LOG_WALK_REPLAY_INODES;
4384                 goto again;
4385         }
4386         /* step three is to replay everything */
4387         if (wc.stage < LOG_WALK_REPLAY_ALL) {
4388                 wc.stage++;
4389                 goto again;
4390         }
4391
4392         btrfs_free_path(path);
4393
4394         /* step 4: commit the transaction, which also unpins the blocks */
4395         ret = btrfs_commit_transaction(trans, fs_info->tree_root);
4396         if (ret)
4397                 return ret;
4398
4399         free_extent_buffer(log_root_tree->node);
4400         log_root_tree->log_root = NULL;
4401         fs_info->log_root_recovering = 0;
4402         kfree(log_root_tree);
4403
4404         return 0;
4405 error:
4406         if (wc.trans)
4407                 btrfs_end_transaction(wc.trans, fs_info->tree_root);
4408         btrfs_free_path(path);
4409         return ret;
4410 }
4411
4412 /*
4413  * there are some corner cases where we want to force a full
4414  * commit instead of allowing a directory to be logged.
4415  *
4416  * They revolve around files there were unlinked from the directory, and
4417  * this function updates the parent directory so that a full commit is
4418  * properly done if it is fsync'd later after the unlinks are done.
4419  */
4420 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4421                              struct inode *dir, struct inode *inode,
4422                              int for_rename)
4423 {
4424         /*
4425          * when we're logging a file, if it hasn't been renamed
4426          * or unlinked, and its inode is fully committed on disk,
4427          * we don't have to worry about walking up the directory chain
4428          * to log its parents.
4429          *
4430          * So, we use the last_unlink_trans field to put this transid
4431          * into the file.  When the file is logged we check it and
4432          * don't log the parents if the file is fully on disk.
4433          */
4434         if (S_ISREG(inode->i_mode))
4435                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4436
4437         /*
4438          * if this directory was already logged any new
4439          * names for this file/dir will get recorded
4440          */
4441         smp_mb();
4442         if (BTRFS_I(dir)->logged_trans == trans->transid)
4443                 return;
4444
4445         /*
4446          * if the inode we're about to unlink was logged,
4447          * the log will be properly updated for any new names
4448          */
4449         if (BTRFS_I(inode)->logged_trans == trans->transid)
4450                 return;
4451
4452         /*
4453          * when renaming files across directories, if the directory
4454          * there we're unlinking from gets fsync'd later on, there's
4455          * no way to find the destination directory later and fsync it
4456          * properly.  So, we have to be conservative and force commits
4457          * so the new name gets discovered.
4458          */
4459         if (for_rename)
4460                 goto record;
4461
4462         /* we can safely do the unlink without any special recording */
4463         return;
4464
4465 record:
4466         BTRFS_I(dir)->last_unlink_trans = trans->transid;
4467 }
4468
4469 /*
4470  * Call this after adding a new name for a file and it will properly
4471  * update the log to reflect the new name.
4472  *
4473  * It will return zero if all goes well, and it will return 1 if a
4474  * full transaction commit is required.
4475  */
4476 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4477                         struct inode *inode, struct inode *old_dir,
4478                         struct dentry *parent)
4479 {
4480         struct btrfs_root * root = BTRFS_I(inode)->root;
4481
4482         /*
4483          * this will force the logging code to walk the dentry chain
4484          * up for the file
4485          */
4486         if (S_ISREG(inode->i_mode))
4487                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4488
4489         /*
4490          * if this inode hasn't been logged and directory we're renaming it
4491          * from hasn't been logged, we don't need to log it
4492          */
4493         if (BTRFS_I(inode)->logged_trans <=
4494             root->fs_info->last_trans_committed &&
4495             (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4496                     root->fs_info->last_trans_committed))
4497                 return 0;
4498
4499         return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
4500 }
4501