fs/btrfs/inode.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"

struct btrfs_iget_args {
        u64 ino;
        struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

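/*
 * map the S_IFMT bits of an inode mode to the BTRFS_FT_* file type
 * stored in directory items.  The type bits live in the top nibble
 * of the mode, so shifting down by 12 gives a compact table index
 * (e.g. S_IFDIR is 0040000, and 0040000 >> 12 == 4).
 */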
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);

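/*
 * set up the security context of a newly created inode: inherit
 * POSIX ACLs from the parent directory, then initialize the security
 * xattrs.  Runs inside the transaction that creates the inode.
 */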
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct btrfs_key key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int err = 0;
        int ret;
        size_t cur_size = size;
        size_t datasize;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->leave_spinning = 1;

        key.objectid = btrfs_ino(inode);
        key.offset = start;
        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
        datasize = btrfs_file_extent_calc_inline_size(cur_size);

        inode_add_bytes(inode, size);
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
        if (ret) {
                err = ret;
                goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
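                /*
                 * copy the compressed pages into the inline item one
                 * page at a time; ptr walks forward through the leaf
                 * as each chunk is written
                 */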
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_CACHE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                page_cache_release(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

        return ret;
fail:
        btrfs_free_path(path);
        return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode, u64 start, u64 end,
                                 size_t compressed_size, int compress_type,
                                 struct page **compressed_pages)
{
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = (end + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
        u64 data_len = inline_len;
        int ret;

        if (compressed_size)
                data_len = compressed_size;

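        /*
         * inlining is only worthwhile when the data starts at offset 0,
         * fits in a single page and in a single leaf, extends all the
         * way to i_size, is not sector-aligned uncompressed data (which
         * gains nothing from inlining), and stays under the max_inline
         * mount option
         */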
        if (start > 0 ||
            actual_end >= PAGE_CACHE_SIZE ||
            data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
            (!compressed_size &&
            (actual_end & (root->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > root->fs_info->max_inline) {
                return 1;
        }

        ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
        if (ret)
                return ret;

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        } else if (ret == -ENOSPC) {
                return 1;
        }

        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
}

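/*
 * one compressed extent (or an uncompressed fallback when pages is
 * NULL) waiting for phase two of compressed writeback
 */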
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

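/*
 * a unit of work for the delalloc worker threads: compress and write
 * out the range [start, end] of the inode, collecting the resulting
 * async_extents on the extents list
 */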
struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long nr_pages_ret = 0;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        unsigned long max_compressed = 128 * 1024;
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
        int compress_type = root->fs_info->compress_type;

        /* if this is a small write inside eof, kick off a defrag */
        if ((end - start + 1) < 16 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /* we want to make sure that the amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
         * because it also controls how easily we can spread reads across
         * cpus for decompression.
         *
         * We also want to make sure the amount of IO required to do
         * a random read is reasonably small, so we limit the size of
         * a compressed extent to 128k.
         */
        total_compressed = min(total_compressed, max_uncompressed);
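        /* round the length of the range up to a multiple of the block size */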
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
            (btrfs_test_opt(root, COMPRESS) ||
             (BTRFS_I(inode)->force_compress) ||
             (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           total_compressed, pages,
                                           nr_pages, &nr_pages_ret,
                                           &total_in,
                                           &total_compressed,
                                           max_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_CACHE_SIZE - 1);
                        struct page *page = pages[nr_pages_ret - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_CACHE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        trans = NULL;
                        goto cleanup_and_out;
                }
                trans->block_rsv = &root->fs_info->delalloc_block_rsv;

                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(trans, root, inode,
                                                    start, end, 0, 0, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(trans, root, inode,
                                                    start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, NULL,
                             EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

                        btrfs_end_transaction(trans, root);
                        goto free_pages_out;
                }
                btrfs_end_transaction(trans, root);
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the
                 * compressed size up to a block size boundary so the
                 * allocator does sane things
                 */
                total_compressed = (total_compressed + blocksize - 1) &
                        ~(blocksize - 1);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
                total_in = (total_in + PAGE_CACHE_SIZE - 1) &
                        ~(PAGE_CACHE_SIZE - 1);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
                        num_bytes = total_in;
                }
        }
        if (!will_compress && pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages_ret; i++) {
                        WARN_ON(pages[i]->mapping);
                        page_cache_release(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages_ret = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
        if (will_compress) {
                *num_added += 1;

                /* the async work queues will take care of doing actual
                 * allocation on disk for these compressed pages,
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret,
                                 compress_type);

                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
                        goto again;
                }
        } else {
cleanup_and_bail_uncompressed:
                /*
                 * No compression, but we still need to write the pages in
                 * the file we've been given so far.  Redirty the locked
                 * page if it corresponds to our extent and set things up
                 * for the async work queue to run cow_file_range to do
                 * the normal delalloc dance
                 */
                if (page_offset(locked_page) >= start &&
                    page_offset(locked_page) <= end) {
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
                add_async_extent(async_cow, start, end - start + 1,
                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }

out:
        return ret;

free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
                WARN_ON(pages[i]->mapping);
                page_cache_release(pages[i]);
        }
        kfree(pages);

        goto out;

cleanup_and_out:
        extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                     start, end, NULL,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_CLEAR_DELALLOC |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
        if (!trans || IS_ERR(trans))
                btrfs_error(root->fs_info, ret, "Failed to join transaction");
        else
                btrfs_abort_transaction(trans, root, ret);
        goto free_pages_out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_trans_handle *trans;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree;
        int ret = 0;

        if (list_empty(&async_cow->extents))
                return 0;


        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                } else {
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1);
                        if (ret && ret != -ENOSPC)
                                btrfs_abort_transaction(trans, root, ret);
                        btrfs_end_transaction(trans, root);
                }

                if (ret) {
                        int i;
                        for (i = 0; i < async_extent->nr_pages; i++) {
                                WARN_ON(async_extent->pages[i]->mapping);
                                page_cache_release(async_extent->pages[i]);
                        }
                        kfree(async_extent->pages);
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;
                        unlock_extent(io_tree, async_extent->start,
                                      async_extent->start +
                                      async_extent->ram_size - 1);
                        if (ret == -ENOSPC)
                                goto retry;
                        goto out_free; /* JDM: Requeue? */
                }

                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                btrfs_drop_extent_cache(inode, async_extent->start,
                                        async_extent->start +
                                        async_extent->ram_size - 1, 0);

                em = alloc_extent_map();
                BUG_ON(!em); /* -ENOMEM */
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
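                /*
                 * insert the new mapping; if an old extent map overlaps
                 * (-EEXIST), drop the cached range and retry until the
                 * insert succeeds
                 */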
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                }

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                BUG_ON(ret); /* -ENOMEM */

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode,
                                &BTRFS_I(inode)->io_tree,
                                async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_CLEAR_UNLOCK_PAGE |
                                EXTENT_CLEAR_UNLOCK |
                                EXTENT_CLEAR_DELALLOC |
                                EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);

                BUG_ON(ret); /* -ENOMEM */
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        ret = 0;
out:
        return ret;
out_free:
        kfree(async_extent);
        goto out;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     struct btrfs_root *root,
                                     struct page *locked_page,
                                     u64 start, u64 end, int *page_started,
                                     unsigned long *nr_written,
                                     int unlock)
{
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;

        BUG_ON(btrfs_is_free_space_inode(inode));

        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
        disk_num_bytes = num_bytes;

        /* if this is a small write inside eof, kick off defrag */
        if (num_bytes < 64 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(trans, inode);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(trans, root, inode,
                                            start, end, 0, 0, NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode,
                                     &BTRFS_I(inode)->io_tree,
                                     start, end, NULL,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);

                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(root->fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

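        /*
         * allocate disk extents for the range one chunk at a time,
         * creating an extent map and an ordered extent for each
         * allocation until the whole range is covered
         */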
        while (disk_num_bytes > 0) {
                unsigned long op;

                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           &ins, 1);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
                }

                em = alloc_extent_map();
                BUG_ON(!em); /* -ENOMEM */
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
                em->len = ins.offset;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);

                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, start,
                                                start + ram_size - 1, 0);
                }

                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                BUG_ON(ret); /* -ENOMEM */

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
                                goto out_unlock;
                        }
                }

                if (disk_num_bytes < cur_alloc_size)
                        break;

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
                op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
                        EXTENT_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                             start, start + ram_size - 1,
                                             locked_page, op);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
out:
        return ret;

out_unlock:
        extent_clear_unlock_delalloc(inode,
                     &BTRFS_I(inode)->io_tree,
                     start, end, locked_page,
                     EXTENT_CLEAR_UNLOCK_PAGE |
                     EXTENT_CLEAR_UNLOCK |
                     EXTENT_CLEAR_DELALLOC |
                     EXTENT_CLEAR_DIRTY |
                     EXTENT_SET_WRITEBACK |
                     EXTENT_END_WRITEBACK);

        goto out;
}

static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written,
                                   int unlock)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, locked_page,
                             EXTENT_CLEAR_UNLOCK_PAGE |
                             EXTENT_CLEAR_UNLOCK |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_DIRTY |
                             EXTENT_SET_WRITEBACK |
                             EXTENT_END_WRITEBACK);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        ret = __cow_file_range(trans, inode, root, locked_page, start, end,
                               page_started, nr_written, unlock);

        btrfs_end_transaction(trans, root);

        return ret;
}

/*
 * work queue callback to start compression on a file's pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;

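        /*
         * this work item is done; if the count of outstanding async
         * delalloc pages has dropped below the threshold, wake up any
         * writers throttled in cow_file_range_async()
         */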
        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written)
{
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;
        int limit = 10 * 1024 * 1024;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;

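                /*
                 * carve compressible ranges into 512k units of work so
                 * the compression can be spread across cpus; inodes
                 * flagged nocompress take the whole range at once
                 */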
                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                        cur_end = end;
                else
                        cur_end = min(end, start + 512 * 1024 - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                async_cow->work.func = async_cow_start;
                async_cow->work.ordered_func = async_cow_submit;
                async_cow->work.ordered_free = async_cow_free;
                async_cow->work.flags = 0;

                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

                btrfs_queue_worker(&root->fs_info->delalloc_workers,
                                   &async_cow->work);

                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
                        wait_event(root->fs_info->async_submit_wait,
                           (atomic_read(&root->fs_info->async_delalloc_pages) <
                            limit));
                }

                while (atomic_read(&root->fs_info->async_submit_draining) &&
                      atomic_read(&root->fs_info->async_delalloc_pages)) {
                        wait_event(root->fs_info->async_submit_wait,
                          (atomic_read(&root->fs_info->async_delalloc_pages) ==
                           0));
                }

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

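/*
 * returns 1 if any checksum items exist for the given byte range and 0
 * if none do; any sums found by the lookup are freed before returning.
 * Note that a lookup error also returns 1, which forces cow, the safe
 * fallback.
 */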
static noinline int csum_exist_in_range(struct btrfs_root *root,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}

/*
 * called back for nocow writeback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
                                       struct page *locked_page,
                              u64 start, u64 end, int *page_started, int force,
                              unsigned long *nr_written)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key found_key;
        u64 cow_start;
        u64 cur_offset;
        u64 extent_end;
        u64 extent_offset;
        u64 disk_bytenr;
        u64 num_bytes;
        u64 disk_num_bytes;
        int extent_type;
        int ret, err;
        int type;
        int nocow;
        int check_prev = 1;
        bool nolock;
        u64 ino = btrfs_ino(inode);

        path = btrfs_alloc_path();
        if (!path) {
                extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, locked_page,
                             EXTENT_CLEAR_UNLOCK_PAGE |
                             EXTENT_CLEAR_UNLOCK |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_DIRTY |
                             EXTENT_SET_WRITEBACK |
                             EXTENT_END_WRITEBACK);
                return -ENOMEM;
        }

        nolock = btrfs_is_free_space_inode(inode);

        if (nolock)
                trans = btrfs_join_transaction_nolock(root);
        else
                trans = btrfs_join_transaction(root);

        if (IS_ERR(trans)) {
                extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
                             start, end, locked_page,
                             EXTENT_CLEAR_UNLOCK_PAGE |
                             EXTENT_CLEAR_UNLOCK |
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_DIRTY |
                             EXTENT_SET_WRITEBACK |
                             EXTENT_END_WRITEBACK);
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }

        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        cow_start = (u64)-1;
        cur_offset = start;
        while (1) {
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               cur_offset, 0);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto error;
                }
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0] - 1);
                        if (found_key.objectid == ino &&
                            found_key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                check_prev = 0;
next_slot:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, root, ret);
                                goto error;
                        }
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                nocow = 0;
                disk_bytenr = 0;
                num_bytes = 0;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                if (found_key.objectid > ino ||
                    found_key.type > BTRFS_EXTENT_DATA_KEY ||
                    found_key.offset > end)
                        break;

                if (found_key.offset > cur_offset) {
                        extent_end = found_key.offset;
                        extent_type = 0;
                        goto out_check;
                }

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);

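                /*
                 * a REG or PREALLOC extent can only be written in place
                 * if it survives every check below: it must not be a
                 * hole, must not be compressed, encrypted or otherwise
                 * encoded, must not sit in a readonly block group or be
                 * shared by another reference, and no csums may exist
                 * in the range.  Plain REG extents additionally require
                 * the force flag.
                 */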
                if (extent_type == BTRFS_FILE_EXTENT_REG ||
                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                        extent_offset = btrfs_file_extent_offset(leaf, fi);
                        extent_end = found_key.offset +
                                btrfs_file_extent_num_bytes(leaf, fi);
                        disk_num_bytes =
                                btrfs_file_extent_disk_num_bytes(leaf, fi);
                        if (extent_end <= start) {
                                path->slots[0]++;
                                goto next_slot;
                        }
                        if (disk_bytenr == 0)
                                goto out_check;
                        if (btrfs_file_extent_compression(leaf, fi) ||
                            btrfs_file_extent_encryption(leaf, fi) ||
                            btrfs_file_extent_other_encoding(leaf, fi))
                                goto out_check;
                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
                                goto out_check;
                        if (btrfs_extent_readonly(root, disk_bytenr))
                                goto out_check;
                        if (btrfs_cross_ref_exist(trans, root, ino,
                                                  found_key.offset -
                                                  extent_offset, disk_bytenr))
                                goto out_check;
                        disk_bytenr += extent_offset;
                        disk_bytenr += cur_offset - found_key.offset;
                        num_bytes = min(end + 1, extent_end) - cur_offset;
                        /*
                         * force cow if csums exist in the range.
                         * This ensures that the csums for a given extent
                         * are either valid or do not exist.
                         */
                        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                goto out_check;
                        nocow = 1;
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        extent_end = found_key.offset +
                                btrfs_file_extent_inline_len(leaf, fi);
                        extent_end = ALIGN(extent_end, root->sectorsize);
                } else {
                        BUG_ON(1);
                }
out_check:
                if (extent_end <= start) {
                        path->slots[0]++;
                        goto next_slot;
                }
                if (!nocow) {
                        if (cow_start == (u64)-1)
                                cow_start = cur_offset;
                        cur_offset = extent_end;
                        if (cur_offset > end)
                                break;
                        path->slots[0]++;
                        goto next_slot;
                }

                btrfs_release_path(path);
                if (cow_start != (u64)-1) {
                        ret = __cow_file_range(trans, inode, root, locked_page,
                                               cow_start, found_key.offset - 1,
                                               page_started, nr_written, 1);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
                                goto error;
                        }
                        cow_start = (u64)-1;
                }

                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        struct extent_map *em;
                        struct extent_map_tree *em_tree;
                        em_tree = &BTRFS_I(inode)->extent_tree;
                        em = alloc_extent_map();
                        BUG_ON(!em); /* -ENOMEM */
                        em->start = cur_offset;
                        em->orig_start = em->start;
                        em->len = num_bytes;
                        em->block_len = num_bytes;
                        em->block_start = disk_bytenr;
                        em->orig_block_len = disk_num_bytes;
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
                        set_bit(EXTENT_FLAG_FILLING, &em->flags);
                        while (1) {
                                write_lock(&em_tree->lock);
                                ret = add_extent_mapping(em_tree, em);
                                write_unlock(&em_tree->lock);
                                if (ret != -EEXIST) {
                                        free_extent_map(em);
                                        break;
                                }
                                btrfs_drop_extent_cache(inode, em->start,
                                                em->start + em->len - 1, 0);
                        }
                        type = BTRFS_ORDERED_PREALLOC;
                } else {
                        type = BTRFS_ORDERED_NOCOW;
                }

                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                               num_bytes, num_bytes, type);
1349                 BUG_ON(ret); /* -ENOMEM */
1350
1351                 if (root->root_key.objectid ==
1352                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1353                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1354                                                       num_bytes);
1355                         if (ret) {
1356                                 btrfs_abort_transaction(trans, root, ret);
1357                                 goto error;
1358                         }
1359                 }
1360
1361                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1362                                 cur_offset, cur_offset + num_bytes - 1,
1363                                 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1364                                 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1365                                 EXTENT_SET_PRIVATE2);
1366                 cur_offset = extent_end;
1367                 if (cur_offset > end)
1368                         break;
1369         }
1370         btrfs_release_path(path);
1371
1372         if (cur_offset <= end && cow_start == (u64)-1) {
1373                 cow_start = cur_offset;
1374                 cur_offset = end;
1375         }
1376
1377         if (cow_start != (u64)-1) {
1378                 ret = __cow_file_range(trans, inode, root, locked_page,
1379                                        cow_start, end,
1380                                        page_started, nr_written, 1);
1381                 if (ret) {
1382                         btrfs_abort_transaction(trans, root, ret);
1383                         goto error;
1384                 }
1385         }
1386
1387 error:
1388         err = btrfs_end_transaction(trans, root);
1389         if (!ret)
1390                 ret = err;
1391
1392         if (ret && cur_offset < end)
1393                 extent_clear_unlock_delalloc(inode,
1394                              &BTRFS_I(inode)->io_tree,
1395                              cur_offset, end, locked_page,
1396                              EXTENT_CLEAR_UNLOCK_PAGE |
1397                              EXTENT_CLEAR_UNLOCK |
1398                              EXTENT_CLEAR_DELALLOC |
1399                              EXTENT_CLEAR_DIRTY |
1400                              EXTENT_SET_WRITEBACK |
1401                              EXTENT_END_WRITEBACK);
1402
1403         btrfs_free_path(path);
1404         return ret;
1405 }
1406
1407 /*
1408  * extent_io.c callback to do delayed allocation processing
1409  */
1410 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1411                               u64 start, u64 end, int *page_started,
1412                               unsigned long *nr_written)
1413 {
1414         int ret;
1415         struct btrfs_root *root = BTRFS_I(inode)->root;
1416
1417         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1418                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1419                                          page_started, 1, nr_written);
1420         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1421                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1422                                          page_started, 0, nr_written);
1423         } else if (!btrfs_test_opt(root, COMPRESS) &&
1424                    !(BTRFS_I(inode)->force_compress) &&
1425                    !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1426                 ret = cow_file_range(inode, locked_page, start, end,
1427                                       page_started, nr_written, 1);
1428         } else {
1429                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1430                         &BTRFS_I(inode)->runtime_flags);
1431                 ret = cow_file_range_async(inode, locked_page, start, end,
1432                                            page_started, nr_written);
1433         }
1434         return ret;
1435 }
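
/*
 * The dispatch above, condensed (illustrative sketch, not compiled; the
 * helper name is invented): nocow is forced for NODATACOW inodes and
 * attempted for PREALLOC inodes, plain cow is used when no form of
 * compression can apply, and everything else takes the async path so
 * compression can run in worker threads.
 */
#if 0
static int delalloc_wants_async(struct inode *inode, struct btrfs_root *root)
{
        if (BTRFS_I(inode)->flags &
            (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))
                return 0;       /* run_delalloc_nocow() */
        if (!btrfs_test_opt(root, COMPRESS) &&
            !BTRFS_I(inode)->force_compress &&
            !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
                return 0;       /* plain cow_file_range() */
        return 1;               /* cow_file_range_async() */
}
#endif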
1436
1437 static void btrfs_split_extent_hook(struct inode *inode,
1438                                     struct extent_state *orig, u64 split)
1439 {
1440         /* not delalloc, ignore it */
1441         if (!(orig->state & EXTENT_DELALLOC))
1442                 return;
1443
1444         spin_lock(&BTRFS_I(inode)->lock);
1445         BTRFS_I(inode)->outstanding_extents++;
1446         spin_unlock(&BTRFS_I(inode)->lock);
1447 }
1448
1449 /*
1450  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1451  * extents, e.g. new extents merged onto old ones during sequential
1452  * writes, so we can properly account for the metadata space we'll
1453  * need.
1454  */
1455 static void btrfs_merge_extent_hook(struct inode *inode,
1456                                     struct extent_state *new,
1457                                     struct extent_state *other)
1458 {
1459         /* not delalloc, ignore it */
1460         if (!(other->state & EXTENT_DELALLOC))
1461                 return;
1462
1463         spin_lock(&BTRFS_I(inode)->lock);
1464         BTRFS_I(inode)->outstanding_extents--;
1465         spin_unlock(&BTRFS_I(inode)->lock);
1466 }
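
/*
 * Together, the split and merge hooks above keep outstanding_extents
 * equal to the number of delalloc extents on the inode, which is what
 * the metadata reservation code sizes reservations against.
 * Illustrative sketch of the pairing (not compiled; the helper names
 * are placeholders):
 */
#if 0
        /* one delalloc extent, outstanding_extents == 1 */
        split_delalloc_extent(orig, split);     /* hook bumps it to 2 */
        merge_delalloc_extents(new, other);     /* hook drops it to 1 */
#endif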
1467
1468 /*
1469  * extent_io.c set_bit_hook, used to track delayed allocation
1470  * bytes in this file, and to maintain the list of inodes that
1471  * have pending delalloc work to be done.
1472  */
1473 static void btrfs_set_bit_hook(struct inode *inode,
1474                                struct extent_state *state, int *bits)
1475 {
1476
1477         /*
1478          * set_bit and clear bit hooks normally require _irqsave/restore
1479          * but in this case, we are only testing for the DELALLOC
1480          * bit, which is only set or cleared with irqs on
1481          */
1482         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1483                 struct btrfs_root *root = BTRFS_I(inode)->root;
1484                 u64 len = state->end + 1 - state->start;
1485                 bool do_list = !btrfs_is_free_space_inode(inode);
1486
1487                 if (*bits & EXTENT_FIRST_DELALLOC) {
1488                         *bits &= ~EXTENT_FIRST_DELALLOC;
1489                 } else {
1490                         spin_lock(&BTRFS_I(inode)->lock);
1491                         BTRFS_I(inode)->outstanding_extents++;
1492                         spin_unlock(&BTRFS_I(inode)->lock);
1493                 }
1494
1495                 spin_lock(&root->fs_info->delalloc_lock);
1496                 BTRFS_I(inode)->delalloc_bytes += len;
1497                 root->fs_info->delalloc_bytes += len;
1498                 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1499                         list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1500                                       &root->fs_info->delalloc_inodes);
1501                 }
1502                 spin_unlock(&root->fs_info->delalloc_lock);
1503         }
1504 }
1505
1506 /*
1507  * extent_io.c clear_bit_hook, see set_bit_hook for why
1508  */
1509 static void btrfs_clear_bit_hook(struct inode *inode,
1510                                  struct extent_state *state, int *bits)
1511 {
1512         /*
1513          * set_bit and clear bit hooks normally require _irqsave/restore
1514          * but in this case, we are only testing for the DELALLOC
1515          * bit, which is only set or cleared with irqs on
1516          */
1517         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1518                 struct btrfs_root *root = BTRFS_I(inode)->root;
1519                 u64 len = state->end + 1 - state->start;
1520                 bool do_list = !btrfs_is_free_space_inode(inode);
1521
1522                 if (*bits & EXTENT_FIRST_DELALLOC) {
1523                         *bits &= ~EXTENT_FIRST_DELALLOC;
1524                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1525                         spin_lock(&BTRFS_I(inode)->lock);
1526                         BTRFS_I(inode)->outstanding_extents--;
1527                         spin_unlock(&BTRFS_I(inode)->lock);
1528                 }
1529
1530                 if (*bits & EXTENT_DO_ACCOUNTING)
1531                         btrfs_delalloc_release_metadata(inode, len);
1532
1533                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1534                     && do_list)
1535                         btrfs_free_reserved_data_space(inode, len);
1536
1537                 spin_lock(&root->fs_info->delalloc_lock);
1538                 root->fs_info->delalloc_bytes -= len;
1539                 BTRFS_I(inode)->delalloc_bytes -= len;
1540
1541                 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1542                     !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543                         list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1544                 }
1545                 spin_unlock(&root->fs_info->delalloc_lock);
1546         }
1547 }
1548
1549 /*
1550  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1551  * we don't create bios that span stripes or chunks
1552  */
1553 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1554                          size_t size, struct bio *bio,
1555                          unsigned long bio_flags)
1556 {
1557         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1558         u64 logical = (u64)bio->bi_sector << 9;
1559         u64 length = 0;
1560         u64 map_length;
1561         int ret;
1562
1563         if (bio_flags & EXTENT_BIO_COMPRESSED)
1564                 return 0;
1565
1566         length = bio->bi_size;
1567         map_length = length;
1568         ret = btrfs_map_block(root->fs_info, READ, logical,
1569                               &map_length, NULL, 0);
1570         /* Will always return 0 with a NULL bbio_ret, as passed above */
1571         BUG_ON(ret < 0);
1572         if (map_length < length + size)
1573                 return 1;
1574         return 0;
1575 }
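
/*
 * A non-zero return from btrfs_merge_bio_hook() tells the caller in
 * extent_io.c that adding the page would make the bio span a stripe or
 * chunk boundary, so the bio must be submitted as-is and the page
 * placed in a fresh one.  Rough sketch of that caller-side contract
 * (illustrative only):
 */
#if 0
        if (btrfs_merge_bio_hook(page, offset, size, bio, bio_flags)) {
                submit_one_bio(rw, bio, mirror_num, bio_flags);
                bio = NULL;     /* start a new bio for this page */
        }
#endif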
1576
1577 /*
1578  * in order to insert checksums into the metadata in large chunks,
1579  * we wait until bio submission time.  All the pages in the bio are
1580  * checksummed and the sums are attached to the ordered extent record.
1581  *
1582  * At IO completion time the csums attached to the ordered extent record
1583  * are inserted into the btree.
1584  */
1585 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1586                                     struct bio *bio, int mirror_num,
1587                                     unsigned long bio_flags,
1588                                     u64 bio_offset)
1589 {
1590         struct btrfs_root *root = BTRFS_I(inode)->root;
1591         int ret = 0;
1592
1593         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1594         BUG_ON(ret); /* -ENOMEM */
1595         return 0;
1596 }
1597
1598 /*
1599  * in order to insert checksums into the metadata in large chunks,
1600  * we wait until bio submission time.  All the pages in the bio are
1601  * checksummed and the sums are attached to the ordered extent record.
1602  *
1603  * At IO completion time the csums attached to the ordered extent record
1604  * are inserted into the btree.
1605  */
1606 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1607                           int mirror_num, unsigned long bio_flags,
1608                           u64 bio_offset)
1609 {
1610         struct btrfs_root *root = BTRFS_I(inode)->root;
1611         int ret;
1612
1613         ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1614         if (ret)
1615                 bio_endio(bio, ret);
1616         return ret;
1617 }
1618
1619 /*
1620  * extent_io.c submission hook. This does the right thing for csum calculation
1621  * on write, or reading the csums from the tree before a read
1622  */
1623 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1624                           int mirror_num, unsigned long bio_flags,
1625                           u64 bio_offset)
1626 {
1627         struct btrfs_root *root = BTRFS_I(inode)->root;
1628         int ret = 0;
1629         int skip_sum;
1630         int metadata = 0;
1631         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1632
1633         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1634
1635         if (btrfs_is_free_space_inode(inode))
1636                 metadata = 2;
1637
1638         if (!(rw & REQ_WRITE)) {
1639                 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1640                 if (ret)
1641                         goto out;
1642
1643                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1644                         ret = btrfs_submit_compressed_read(inode, bio,
1645                                                            mirror_num,
1646                                                            bio_flags);
1647                         goto out;
1648                 } else if (!skip_sum) {
1649                         ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1650                         if (ret)
1651                                 goto out;
1652                 }
1653                 goto mapit;
1654         } else if (async && !skip_sum) {
1655                 /* csum items have already been cloned */
1656                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1657                         goto mapit;
1658                 /* we're doing a write, do the async checksumming */
1659                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1660                                    inode, rw, bio, mirror_num,
1661                                    bio_flags, bio_offset,
1662                                    __btrfs_submit_bio_start,
1663                                    __btrfs_submit_bio_done);
1664                 goto out;
1665         } else if (!skip_sum) {
1666                 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1667                 if (ret)
1668                         goto out;
1669         }
1670
1671 mapit:
1672         ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1673
1674 out:
1675         if (ret < 0)
1676                 bio_endio(bio, ret);
1677         return ret;
1678 }
1679
1680 /*
1681  * given a list of ordered sums, record them in the inode.  This happens
1682  * at IO completion time based on sums calculated at bio submission time.
1683  */
1684 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1685                              struct inode *inode, u64 file_offset,
1686                              struct list_head *list)
1687 {
1688         struct btrfs_ordered_sum *sum;
1689
1690         list_for_each_entry(sum, list, list) {
1691                 btrfs_csum_file_blocks(trans,
1692                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1693         }
1694         return 0;
1695 }
1696
1697 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1698                               struct extent_state **cached_state)
1699 {
1700         WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1701         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1702                                    cached_state, GFP_NOFS);
1703 }
1704
1705 /* see btrfs_writepage_start_hook for details on why this is required */
1706 struct btrfs_writepage_fixup {
1707         struct page *page;
1708         struct btrfs_work work;
1709 };
1710
1711 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1712 {
1713         struct btrfs_writepage_fixup *fixup;
1714         struct btrfs_ordered_extent *ordered;
1715         struct extent_state *cached_state = NULL;
1716         struct page *page;
1717         struct inode *inode;
1718         u64 page_start;
1719         u64 page_end;
1720         int ret;
1721
1722         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1723         page = fixup->page;
1724 again:
1725         lock_page(page);
1726         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1727                 ClearPageChecked(page);
1728                 goto out_page;
1729         }
1730
1731         inode = page->mapping->host;
1732         page_start = page_offset(page);
1733         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1734
1735         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1736                          &cached_state);
1737
1738         /* already ordered? We're done */
1739         if (PagePrivate2(page))
1740                 goto out;
1741
1742         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1743         if (ordered) {
1744                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1745                                      page_end, &cached_state, GFP_NOFS);
1746                 unlock_page(page);
1747                 btrfs_start_ordered_extent(inode, ordered, 1);
1748                 btrfs_put_ordered_extent(ordered);
1749                 goto again;
1750         }
1751
1752         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1753         if (ret) {
1754                 mapping_set_error(page->mapping, ret);
1755                 end_extent_writepage(page, ret, page_start, page_end);
1756                 ClearPageChecked(page);
1757                 goto out;
1758         }
1759
1760         btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1761         ClearPageChecked(page);
1762         set_page_dirty(page);
1763 out:
1764         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1765                              &cached_state, GFP_NOFS);
1766 out_page:
1767         unlock_page(page);
1768         page_cache_release(page);
1769         kfree(fixup);
1770 }
1771
1772 /*
1773  * There are a few paths in the higher layers of the kernel that directly
1774  * set the page dirty bit without asking the filesystem if it is a
1775  * good idea.  This causes problems because we want to make sure COW
1776  * properly happens and the data=ordered rules are followed.
1777  *
1778  * In our case any range that doesn't have the ORDERED bit set
1779  * hasn't been properly set up for IO.  We kick off an async process
1780  * to fix it up.  The async helper will wait for ordered extents, set
1781  * the delalloc bit and make it safe to write the page.
1782  */
1783 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1784 {
1785         struct inode *inode = page->mapping->host;
1786         struct btrfs_writepage_fixup *fixup;
1787         struct btrfs_root *root = BTRFS_I(inode)->root;
1788
1789         /* this page is properly in the ordered list */
1790         if (TestClearPagePrivate2(page))
1791                 return 0;
1792
1793         if (PageChecked(page))
1794                 return -EAGAIN;
1795
1796         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1797         if (!fixup)
1798                 return -EAGAIN;
1799
1800         SetPageChecked(page);
1801         page_cache_get(page);
1802         fixup->work.func = btrfs_writepage_fixup_worker;
1803         fixup->page = page;
1804         btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1805         return -EBUSY;
1806 }
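
/*
 * How writepage reacts to the hook above (illustrative sketch, not
 * compiled): a non-zero return means the page must not be written now,
 * so the caller redirties it and moves on; for -EBUSY a fixup worker
 * now owns the page and will wait for any ordered extent, re-mark the
 * range delalloc and dirty the page again.
 */
#if 0
        ret = btrfs_writepage_start_hook(page, start, end);
        if (ret) {
                /* -EBUSY: fixup worker queued; -EAGAIN: retry later */
                redirty_page_for_writepage(wbc, page);
                unlock_page(page);
                return 0;
        }
#endif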
1807
1808 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1809                                        struct inode *inode, u64 file_pos,
1810                                        u64 disk_bytenr, u64 disk_num_bytes,
1811                                        u64 num_bytes, u64 ram_bytes,
1812                                        u8 compression, u8 encryption,
1813                                        u16 other_encoding, int extent_type)
1814 {
1815         struct btrfs_root *root = BTRFS_I(inode)->root;
1816         struct btrfs_file_extent_item *fi;
1817         struct btrfs_path *path;
1818         struct extent_buffer *leaf;
1819         struct btrfs_key ins;
1820         int ret;
1821
1822         path = btrfs_alloc_path();
1823         if (!path)
1824                 return -ENOMEM;
1825
1826         path->leave_spinning = 1;
1827
1828         /*
1829          * we may be replacing one extent in the tree with another.
1830          * The new extent is pinned in the extent map, and we don't want
1831          * to drop it from the cache until it is completely in the btree.
1832          *
1833          * So, tell btrfs_drop_extents to leave this extent in the cache.
1834          * The caller is expected to unpin it and allow it to be merged
1835          * with the others.
1836          */
1837         ret = btrfs_drop_extents(trans, root, inode, file_pos,
1838                                  file_pos + num_bytes, 0);
1839         if (ret)
1840                 goto out;
1841
1842         ins.objectid = btrfs_ino(inode);
1843         ins.offset = file_pos;
1844         ins.type = BTRFS_EXTENT_DATA_KEY;
1845         ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1846         if (ret)
1847                 goto out;
1848         leaf = path->nodes[0];
1849         fi = btrfs_item_ptr(leaf, path->slots[0],
1850                             struct btrfs_file_extent_item);
1851         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1852         btrfs_set_file_extent_type(leaf, fi, extent_type);
1853         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1854         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1855         btrfs_set_file_extent_offset(leaf, fi, 0);
1856         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1857         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1858         btrfs_set_file_extent_compression(leaf, fi, compression);
1859         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1860         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1861
1862         btrfs_mark_buffer_dirty(leaf);
1863         btrfs_release_path(path);
1864
1865         inode_add_bytes(inode, num_bytes);
1866
1867         ins.objectid = disk_bytenr;
1868         ins.offset = disk_num_bytes;
1869         ins.type = BTRFS_EXTENT_ITEM_KEY;
1870         ret = btrfs_alloc_reserved_file_extent(trans, root,
1871                                         root->root_key.objectid,
1872                                         btrfs_ino(inode), file_pos, &ins);
1873 out:
1874         btrfs_free_path(path);
1875
1876         return ret;
1877 }
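
/*
 * The two keys touched above, spelled out (illustrative sketch): the
 * file extent item lives in the fs tree keyed by inode and file offset,
 * while the backing extent item lives in the extent tree keyed by disk
 * location and length.
 */
#if 0
        struct btrfs_key file_key = {
                .objectid = btrfs_ino(inode),   /* which inode */
                .type = BTRFS_EXTENT_DATA_KEY,
                .offset = file_pos,             /* logical file offset */
        };
        struct btrfs_key extent_key = {
                .objectid = disk_bytenr,        /* where on disk */
                .type = BTRFS_EXTENT_ITEM_KEY,
                .offset = disk_num_bytes,       /* extent length */
        };
#endif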
1878
1879 /*
1880  * as ordered data IO finishes, this gets called so we can finish
1881  * an ordered extent if the range of bytes in the file it covers is
1882  * fully written.
1883  */
1889 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1890 {
1891         struct inode *inode = ordered_extent->inode;
1892         struct btrfs_root *root = BTRFS_I(inode)->root;
1893         struct btrfs_trans_handle *trans = NULL;
1894         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1895         struct extent_state *cached_state = NULL;
1896         int compress_type = 0;
1897         int ret;
1898         bool nolock;
1899
1900         nolock = btrfs_is_free_space_inode(inode);
1901
1902         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1903                 ret = -EIO;
1904                 goto out;
1905         }
1906
1907         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1908                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1909                 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1910                 if (!ret) {
1911                         if (nolock)
1912                                 trans = btrfs_join_transaction_nolock(root);
1913                         else
1914                                 trans = btrfs_join_transaction(root);
1915                         if (IS_ERR(trans)) {
1916                                 ret = PTR_ERR(trans);
1917                                 trans = NULL;
1918                                 goto out;
1919                         }
1920                         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1921                         ret = btrfs_update_inode_fallback(trans, root, inode);
1922                         if (ret) /* -ENOMEM or corruption */
1923                                 btrfs_abort_transaction(trans, root, ret);
1924                 }
1925                 goto out;
1926         }
1927
1928         lock_extent_bits(io_tree, ordered_extent->file_offset,
1929                          ordered_extent->file_offset + ordered_extent->len - 1,
1930                          0, &cached_state);
1931
1932         if (nolock)
1933                 trans = btrfs_join_transaction_nolock(root);
1934         else
1935                 trans = btrfs_join_transaction(root);
1936         if (IS_ERR(trans)) {
1937                 ret = PTR_ERR(trans);
1938                 trans = NULL;
1939                 goto out_unlock;
1940         }
1941         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1942
1943         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1944                 compress_type = ordered_extent->compress_type;
1945         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1946                 BUG_ON(compress_type);
1947                 ret = btrfs_mark_extent_written(trans, inode,
1948                                                 ordered_extent->file_offset,
1949                                                 ordered_extent->file_offset +
1950                                                 ordered_extent->len);
1951         } else {
1952                 BUG_ON(root == root->fs_info->tree_root);
1953                 ret = insert_reserved_file_extent(trans, inode,
1954                                                 ordered_extent->file_offset,
1955                                                 ordered_extent->start,
1956                                                 ordered_extent->disk_len,
1957                                                 ordered_extent->len,
1958                                                 ordered_extent->len,
1959                                                 compress_type, 0, 0,
1960                                                 BTRFS_FILE_EXTENT_REG);
1961         }
1962         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1963                            ordered_extent->file_offset, ordered_extent->len,
1964                            trans->transid);
1965         if (ret < 0) {
1966                 btrfs_abort_transaction(trans, root, ret);
1967                 goto out_unlock;
1968         }
1969
1970         add_pending_csums(trans, inode, ordered_extent->file_offset,
1971                           &ordered_extent->list);
1972
1973         ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1974         if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1975                 ret = btrfs_update_inode_fallback(trans, root, inode);
1976                 if (ret) { /* -ENOMEM or corruption */
1977                         btrfs_abort_transaction(trans, root, ret);
1978                         goto out_unlock;
1979                 }
1980         } else {
1981                 btrfs_set_inode_last_trans(trans, inode);
1982         }
1983         ret = 0;
1984 out_unlock:
1985         unlock_extent_cached(io_tree, ordered_extent->file_offset,
1986                              ordered_extent->file_offset +
1987                              ordered_extent->len - 1, &cached_state, GFP_NOFS);
1988 out:
1989         if (root != root->fs_info->tree_root)
1990                 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1991         if (trans)
1992                 btrfs_end_transaction(trans, root);
1993
1994         if (ret)
1995                 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
1996                                       ordered_extent->file_offset +
1997                                       ordered_extent->len - 1, NULL, GFP_NOFS);
1998
1999         /*
2000          * This needs to be done to make sure anybody waiting knows we are done
2001          * updating everything for this ordered extent.
2002          */
2003         btrfs_remove_ordered_extent(inode, ordered_extent);
2004
2005         /* once for us */
2006         btrfs_put_ordered_extent(ordered_extent);
2007         /* once for the tree */
2008         btrfs_put_ordered_extent(ordered_extent);
2009
2010         return ret;
2011 }
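
/*
 * Condensed flow of the extent insertion above (illustrative sketch,
 * not compiled; "oe" stands in for the ordered extent): nocow extents
 * were written in place and need no new file extent, prealloc extents
 * are flipped to written in place, and everything else gets a fresh
 * file extent item pointing at the reserved space.
 */
#if 0
        if (test_bit(BTRFS_ORDERED_NOCOW, &oe->flags))
                ;       /* only i_size/inode updates are needed */
        else if (test_bit(BTRFS_ORDERED_PREALLOC, &oe->flags))
                btrfs_mark_extent_written(trans, inode, oe->file_offset,
                                          oe->file_offset + oe->len);
        else
                insert_reserved_file_extent(trans, inode, oe->file_offset,
                                            oe->start, oe->disk_len,
                                            oe->len, oe->len,
                                            compress_type, 0, 0,
                                            BTRFS_FILE_EXTENT_REG);
#endif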
2012
2013 static void finish_ordered_fn(struct btrfs_work *work)
2014 {
2015         struct btrfs_ordered_extent *ordered_extent;
2016         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2017         btrfs_finish_ordered_io(ordered_extent);
2018 }
2019
2020 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2021                                 struct extent_state *state, int uptodate)
2022 {
2023         struct inode *inode = page->mapping->host;
2024         struct btrfs_root *root = BTRFS_I(inode)->root;
2025         struct btrfs_ordered_extent *ordered_extent = NULL;
2026         struct btrfs_workers *workers;
2027
2028         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2029
2030         ClearPagePrivate2(page);
2031         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2032                                             end - start + 1, uptodate))
2033                 return 0;
2034
2035         ordered_extent->work.func = finish_ordered_fn;
2036         ordered_extent->work.flags = 0;
2037
2038         if (btrfs_is_free_space_inode(inode))
2039                 workers = &root->fs_info->endio_freespace_worker;
2040         else
2041                 workers = &root->fs_info->endio_write_workers;
2042         btrfs_queue_worker(workers, &ordered_extent->work);
2043
2044         return 0;
2045 }
2046
2047 /*
2048  * when reads are done, we need to check csums to verify the data is correct
2049  * if there's a match, we allow the bio to finish.  If not, the code in
2050  * extent_io.c will try to find good copies for us.
2051  */
2052 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2053                                struct extent_state *state, int mirror)
2054 {
2055         size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
2056         struct inode *inode = page->mapping->host;
2057         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2058         char *kaddr;
2059         u64 private = ~(u32)0;
2060         int ret;
2061         struct btrfs_root *root = BTRFS_I(inode)->root;
2062         u32 csum = ~(u32)0;
2063
2064         if (PageChecked(page)) {
2065                 ClearPageChecked(page);
2066                 goto good;
2067         }
2068
2069         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2070                 goto good;
2071
2072         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2073             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2074                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2075                                   GFP_NOFS);
2076                 return 0;
2077         }
2078
2079         if (state && state->start == start) {
2080                 private = state->private;
2081                 ret = 0;
2082         } else {
2083                 ret = get_state_private(io_tree, start, &private);
2084         }
2085         kaddr = kmap_atomic(page);
2086         if (ret)
2087                 goto zeroit;
2088
2089         csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
2090         btrfs_csum_final(csum, (char *)&csum);
2091         if (csum != private)
2092                 goto zeroit;
2093
2094         kunmap_atomic(kaddr);
2095 good:
2096         return 0;
2097
2098 zeroit:
2099         printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
2100                        "private %llu\n",
2101                        (unsigned long long)btrfs_ino(page->mapping->host),
2102                        (unsigned long long)start, csum,
2103                        (unsigned long long)private);
2104         memset(kaddr + offset, 1, end - start + 1);
2105         flush_dcache_page(page);
2106         kunmap_atomic(kaddr);
2107         if (private == 0)
2108                 return 0;
2109         return -EIO;
2110 }
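
/*
 * The verification above in miniature (illustrative sketch): the csum
 * for each block was stashed in the io_tree's per-state private field
 * at read submission time, and the crc of the freshly read data must
 * match it.
 */
#if 0
        csum = btrfs_csum_data(root, kaddr + offset, ~(u32)0,
                               end - start + 1);
        btrfs_csum_final(csum, (char *)&csum);
        if ((u64)csum != private)
                goto zeroit;    /* mismatch poisons the range */
#endif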
2111
2112 struct delayed_iput {
2113         struct list_head list;
2114         struct inode *inode;
2115 };
2116
2117 /* JDM: If this is fs-wide, why can't we add a pointer to
2118  * btrfs_inode instead and avoid the allocation? */
2119 void btrfs_add_delayed_iput(struct inode *inode)
2120 {
2121         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2122         struct delayed_iput *delayed;
2123
2124         if (atomic_add_unless(&inode->i_count, -1, 1))
2125                 return;
2126
2127         delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2128         delayed->inode = inode;
2129
2130         spin_lock(&fs_info->delayed_iput_lock);
2131         list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2132         spin_unlock(&fs_info->delayed_iput_lock);
2133 }
2134
2135 void btrfs_run_delayed_iputs(struct btrfs_root *root)
2136 {
2137         LIST_HEAD(list);
2138         struct btrfs_fs_info *fs_info = root->fs_info;
2139         struct delayed_iput *delayed;
2140         int empty;
2141
2142         spin_lock(&fs_info->delayed_iput_lock);
2143         empty = list_empty(&fs_info->delayed_iputs);
2144         spin_unlock(&fs_info->delayed_iput_lock);
2145         if (empty)
2146                 return;
2147
2148         spin_lock(&fs_info->delayed_iput_lock);
2149         list_splice_init(&fs_info->delayed_iputs, &list);
2150         spin_unlock(&fs_info->delayed_iput_lock);
2151
2152         while (!list_empty(&list)) {
2153                 delayed = list_entry(list.next, struct delayed_iput, list);
2154                 list_del(&delayed->list);
2155                 iput(delayed->inode);
2156                 kfree(delayed);
2157         }
2158 }
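
/*
 * Why the atomic_add_unless() in btrfs_add_delayed_iput() is safe
 * (illustrative sketch; the queuing call is a placeholder): if i_count
 * is above 1 we can drop our reference on the spot; only when we would
 * be the one triggering eviction is the final iput deferred to
 * btrfs_run_delayed_iputs(), which runs in a context where eviction
 * may safely take locks.
 */
#if 0
        if (atomic_add_unless(&inode->i_count, -1, 1))
                return;                 /* not the last reference */
        defer_final_iput(inode);        /* placeholder for the queuing */
#endif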
2159
2160 enum btrfs_orphan_cleanup_state {
2161         ORPHAN_CLEANUP_STARTED  = 1,
2162         ORPHAN_CLEANUP_DONE     = 2,
2163 };
2164
2165 /*
2166  * This is called at transaction commit time.  If there are no orphan
2167  * files in the subvolume, it removes the orphan item and frees the
2168  * block_rsv structure.
2169  */
2170 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2171                               struct btrfs_root *root)
2172 {
2173         struct btrfs_block_rsv *block_rsv;
2174         int ret;
2175
2176         if (atomic_read(&root->orphan_inodes) ||
2177             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2178                 return;
2179
2180         spin_lock(&root->orphan_lock);
2181         if (atomic_read(&root->orphan_inodes)) {
2182                 spin_unlock(&root->orphan_lock);
2183                 return;
2184         }
2185
2186         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2187                 spin_unlock(&root->orphan_lock);
2188                 return;
2189         }
2190
2191         block_rsv = root->orphan_block_rsv;
2192         root->orphan_block_rsv = NULL;
2193         spin_unlock(&root->orphan_lock);
2194
2195         if (root->orphan_item_inserted &&
2196             btrfs_root_refs(&root->root_item) > 0) {
2197                 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2198                                             root->root_key.objectid);
2199                 BUG_ON(ret);
2200                 root->orphan_item_inserted = 0;
2201         }
2202
2203         if (block_rsv) {
2204                 WARN_ON(block_rsv->size > 0);
2205                 btrfs_free_block_rsv(root, block_rsv);
2206         }
2207 }
2208
2209 /*
2210  * This creates an orphan entry for the given inode in case something goes
2211  * wrong in the middle of an unlink/truncate.
2212  *
2213  * NOTE: caller of this function should reserve 5 units of metadata for
2214  *       this function.
2215  */
2216 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2217 {
2218         struct btrfs_root *root = BTRFS_I(inode)->root;
2219         struct btrfs_block_rsv *block_rsv = NULL;
2220         int reserve = 0;
2221         int insert = 0;
2222         int ret;
2223
2224         if (!root->orphan_block_rsv) {
2225                 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2226                 if (!block_rsv)
2227                         return -ENOMEM;
2228         }
2229
2230         spin_lock(&root->orphan_lock);
2231         if (!root->orphan_block_rsv) {
2232                 root->orphan_block_rsv = block_rsv;
2233         } else if (block_rsv) {
2234                 btrfs_free_block_rsv(root, block_rsv);
2235                 block_rsv = NULL;
2236         }
2237
2238         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2239                               &BTRFS_I(inode)->runtime_flags)) {
2240 #if 0
2241                 /*
2242                  * For proper ENOSPC handling, we should do orphan
2243                  * cleanup when mounting. But this introduces backward
2244                  * compatibility issue.
2245                  */
2246                 if (!xchg(&root->orphan_item_inserted, 1))
2247                         insert = 2;
2248                 else
2249                         insert = 1;
2250 #endif
2251                 insert = 1;
2252                 atomic_inc(&root->orphan_inodes);
2253         }
2254
2255         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2256                               &BTRFS_I(inode)->runtime_flags))
2257                 reserve = 1;
2258         spin_unlock(&root->orphan_lock);
2259
2260         /* grab metadata reservation from transaction handle */
2261         if (reserve) {
2262                 ret = btrfs_orphan_reserve_metadata(trans, inode);
2263                 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
2264         }
2265
2266         /* insert an orphan item to track this unlinked/truncated file */
2267         if (insert >= 1) {
2268                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2269                 if (ret && ret != -EEXIST) {
2270                         clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2271                                   &BTRFS_I(inode)->runtime_flags);
2272                         btrfs_abort_transaction(trans, root, ret);
2273                         return ret;
2274                 }
2275                 ret = 0;
2276         }
2277
2278         /* insert an orphan item to track that the subvolume contains orphans */
2279         if (insert >= 2) {
2280                 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2281                                                root->root_key.objectid);
2282                 if (ret && ret != -EEXIST) {
2283                         btrfs_abort_transaction(trans, root, ret);
2284                         return ret;
2285                 }
2286         }
2287         return 0;
2288 }
2289
2290 /*
2291  * We have done the truncate/delete so we can go ahead and remove the orphan
2292  * item for this particular inode.
2293  */
2294 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2295 {
2296         struct btrfs_root *root = BTRFS_I(inode)->root;
2297         int delete_item = 0;
2298         int release_rsv = 0;
2299         int ret = 0;
2300
2301         spin_lock(&root->orphan_lock);
2302         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2303                                &BTRFS_I(inode)->runtime_flags))
2304                 delete_item = 1;
2305
2306         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2307                                &BTRFS_I(inode)->runtime_flags))
2308                 release_rsv = 1;
2309         spin_unlock(&root->orphan_lock);
2310
2311         if (trans && delete_item) {
2312                 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
2313                 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
2314         }
2315
2316         if (release_rsv) {
2317                 btrfs_orphan_release_metadata(inode);
2318                 atomic_dec(&root->orphan_inodes);
2319         }
2320
2321         return 0;
2322 }
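
/*
 * Lifecycle of the two helpers above (illustrative sketch; the middle
 * call is a placeholder): the orphan item is inserted before the
 * operation that could leave a half-deleted inode behind and removed
 * once the inode is fully gone, so a crash in between leaves an item
 * for btrfs_orphan_cleanup() to find on the next mount.
 */
#if 0
        btrfs_orphan_add(trans, inode);         /* before unlink/truncate */
        do_unlink_or_truncate(inode);           /* the risky work */
        btrfs_orphan_del(trans, inode);         /* after it completed */
#endif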
2323
2324 /*
2325  * this cleans up any orphans that may be left on the list from the last use
2326  * of this root.
2327  */
2328 int btrfs_orphan_cleanup(struct btrfs_root *root)
2329 {
2330         struct btrfs_path *path;
2331         struct extent_buffer *leaf;
2332         struct btrfs_key key, found_key;
2333         struct btrfs_trans_handle *trans;
2334         struct inode *inode;
2335         u64 last_objectid = 0;
2336         int ret = 0, nr_unlink = 0, nr_truncate = 0;
2337
2338         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2339                 return 0;
2340
2341         path = btrfs_alloc_path();
2342         if (!path) {
2343                 ret = -ENOMEM;
2344                 goto out;
2345         }
2346         path->reada = -1;
2347
2348         key.objectid = BTRFS_ORPHAN_OBJECTID;
2349         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
2350         key.offset = (u64)-1;
2351
2352         while (1) {
2353                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2354                 if (ret < 0)
2355                         goto out;
2356
2357                 /*
2358                  * ret == 0 means we found what we were searching for, which
2359                  * is weird, but possible, so only back the path up if we
2360                  * didn't find the key, and check whether the item matches
2361                  */
2362                 if (ret > 0) {
2363                         ret = 0;
2364                         if (path->slots[0] == 0)
2365                                 break;
2366                         path->slots[0]--;
2367                 }
2368
2369                 /* pull out the item */
2370                 leaf = path->nodes[0];
2371                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2372
2373                 /* make sure the item matches what we want */
2374                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
2375                         break;
2376                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
2377                         break;
2378
2379                 /* release the path since we're done with it */
2380                 btrfs_release_path(path);
2381
2382                 /*
2383                  * this is basically btrfs_lookup, minus the root
2384                  * crossing.  We store the inode number in the
2385                  * offset of the orphan item.
2386                  */
2387
2388                 if (found_key.offset == last_objectid) {
2389                         printk(KERN_ERR "btrfs: Error removing orphan entry, "
2390                                "stopping orphan cleanup\n");
2391                         ret = -EINVAL;
2392                         goto out;
2393                 }
2394
2395                 last_objectid = found_key.offset;
2396
2397                 found_key.objectid = found_key.offset;
2398                 found_key.type = BTRFS_INODE_ITEM_KEY;
2399                 found_key.offset = 0;
2400                 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2401                 ret = PTR_RET(inode);
2402                 if (ret && ret != -ESTALE)
2403                         goto out;
2404
2405                 if (ret == -ESTALE && root == root->fs_info->tree_root) {
2406                         struct btrfs_root *dead_root;
2407                         struct btrfs_fs_info *fs_info = root->fs_info;
2408                         int is_dead_root = 0;
2409
2410                         /*
2411                          * this is an orphan in the tree root. Currently these
2412                          * could come from 2 sources:
2413                          *  a) a snapshot deletion in progress
2414                          *  b) a free space cache inode
2415                          * We need to distinguish those two, as the snapshot
2416                          * orphan must not get deleted.
2417                          * find_dead_roots already ran before us, so if this
2418                          * is a snapshot deletion, we should find the root
2419                          * in the dead_roots list
2420                          */
2421                         spin_lock(&fs_info->trans_lock);
2422                         list_for_each_entry(dead_root, &fs_info->dead_roots,
2423                                             root_list) {
2424                                 if (dead_root->root_key.objectid ==
2425                                     found_key.objectid) {
2426                                         is_dead_root = 1;
2427                                         break;
2428                                 }
2429                         }
2430                         spin_unlock(&fs_info->trans_lock);
2431                         if (is_dead_root) {
2432                                 /* prevent this orphan from being found again */
2433                                 key.offset = found_key.objectid - 1;
2434                                 continue;
2435                         }
2436                 }
2437                 /*
2438                  * Inode is already gone but the orphan item is still there,
2439                  * kill the orphan item.
2440                  */
2441                 if (ret == -ESTALE) {
2442                         trans = btrfs_start_transaction(root, 1);
2443                         if (IS_ERR(trans)) {
2444                                 ret = PTR_ERR(trans);
2445                                 goto out;
2446                         }
2447                         printk(KERN_ERR "auto deleting %Lu\n",
2448                                found_key.objectid);
2449                         ret = btrfs_del_orphan_item(trans, root,
2450                                                     found_key.objectid);
2451                         BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
2452                         btrfs_end_transaction(trans, root);
2453                         continue;
2454                 }
2455
2456                 /*
2457                  * add this inode to the orphan list so btrfs_orphan_del does
2458                  * the proper thing when we hit it
2459                  */
2460                 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2461                         &BTRFS_I(inode)->runtime_flags);
2462
2463                 /* if we have links, this was a truncate, let's do that */
2464                 if (inode->i_nlink) {
2465                         if (!S_ISREG(inode->i_mode)) {
2466                                 WARN_ON(1);
2467                                 iput(inode);
2468                                 continue;
2469                         }
2470                         nr_truncate++;
2471                         ret = btrfs_truncate(inode);
2472                 } else {
2473                         nr_unlink++;
2474                 }
2475
2476                 /* this will do delete_inode and everything for us */
2477                 iput(inode);
2478                 if (ret)
2479                         goto out;
2480         }
2481         /* release the path since we're done with it */
2482         btrfs_release_path(path);
2483
2484         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2485
2486         if (root->orphan_block_rsv)
2487                 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2488                                         (u64)-1);
2489
2490         if (root->orphan_block_rsv || root->orphan_item_inserted) {
2491                 trans = btrfs_join_transaction(root);
2492                 if (!IS_ERR(trans))
2493                         btrfs_end_transaction(trans, root);
2494         }
2495
2496         if (nr_unlink)
2497                 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2498         if (nr_truncate)
2499                 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2500
2501 out:
2502         if (ret)
2503                 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2504         btrfs_free_path(path);
2505         return ret;
2506 }
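
/*
 * The key layout the cleanup loop walks (illustrative sketch;
 * "inode_num" is a placeholder): every orphan item shares one objectid
 * and stores the victim inode's number in the key offset, so searching
 * down from offset (u64)-1 visits each orphan in turn.
 */
#if 0
        struct btrfs_key orphan_key = {
                .objectid = BTRFS_ORPHAN_OBJECTID,
                .type = BTRFS_ORPHAN_ITEM_KEY,
                .offset = inode_num,    /* the orphaned inode's number */
        };
#endif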
2507
2508 /*
2509  * very simple check to peek ahead in the leaf looking for xattrs.  If we
2510  * don't find any xattrs, we know there can't be any acls.
2511  *
2512  * slot is the slot the inode is in, objectid is the objectid of the inode
2513  */
2514 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2515                                           int slot, u64 objectid)
2516 {
2517         u32 nritems = btrfs_header_nritems(leaf);
2518         struct btrfs_key found_key;
2519         int scanned = 0;
2520
2521         slot++;
2522         while (slot < nritems) {
2523                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2524
2525                 /* we found a different objectid, there must not be acls */
2526                 if (found_key.objectid != objectid)
2527                         return 0;
2528
2529                 /* we found an xattr, assume we've got an acl */
2530                 if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2531                         return 1;
2532
2533                 /*
2534                  * we found a key greater than an xattr key, there can't
2535                  * be any acls later on
2536                  */
2537                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2538                         return 0;
2539
2540                 slot++;
2541                 scanned++;
2542
2543                 /*
2544                  * it goes inode, inode backrefs, xattrs, extents,
2545                  * so if there are a ton of hard links to an inode there can
2546                  * be a lot of backrefs.  Don't waste time searching too hard,
2547                  * this is just an optimization
2548                  */
2549                 if (scanned >= 8)
2550                         break;
2551         }
2552         /* we hit the end of the leaf before we found an xattr or
2553          * something larger than an xattr.  We have to assume the inode
2554          * has acls
2555          */
2556         return 1;
2557 }
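
/*
 * The item ordering this heuristic relies on (illustrative): for a
 * given objectid, key types sort so the inode item comes first, then
 * inode backrefs, then xattrs, then file extents, which is why any
 * xattr must show up within a few slots of the inode item.
 */
#if 0
        BUILD_BUG_ON(!(BTRFS_INODE_ITEM_KEY < BTRFS_XATTR_ITEM_KEY &&
                       BTRFS_XATTR_ITEM_KEY < BTRFS_EXTENT_DATA_KEY));
#endif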
2558
2559 /*
2560  * read an inode from the btree into the in-memory inode
2561  */
2562 static void btrfs_read_locked_inode(struct inode *inode)
2563 {
2564         struct btrfs_path *path;
2565         struct extent_buffer *leaf;
2566         struct btrfs_inode_item *inode_item;
2567         struct btrfs_timespec *tspec;
2568         struct btrfs_root *root = BTRFS_I(inode)->root;
2569         struct btrfs_key location;
2570         int maybe_acls;
2571         u32 rdev;
2572         int ret;
2573         bool filled = false;
2574
2575         ret = btrfs_fill_inode(inode, &rdev);
2576         if (!ret)
2577                 filled = true;
2578
2579         path = btrfs_alloc_path();
2580         if (!path)
2581                 goto make_bad;
2582
2583         path->leave_spinning = 1;
2584         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2585
2586         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
2587         if (ret)
2588                 goto make_bad;
2589
2590         leaf = path->nodes[0];
2591
2592         if (filled)
2593                 goto cache_acl;
2594
2595         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2596                                     struct btrfs_inode_item);
2597         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2598         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2599         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
2600         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
2601         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
2602
2603         tspec = btrfs_inode_atime(inode_item);
2604         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2605         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2606
2607         tspec = btrfs_inode_mtime(inode_item);
2608         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2609         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2610
2611         tspec = btrfs_inode_ctime(inode_item);
2612         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2613         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2614
2615         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2616         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2617         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
2618
2619         /*
2620          * If we were modified in the current generation and evicted from memory
2621          * and then re-read we need to do a full sync since we don't have any
2622          * idea about which extents were modified before we were evicted from
2623          * cache.
2624          */
2625         if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
2626                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2627                         &BTRFS_I(inode)->runtime_flags);
2628
2629         inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2630         inode->i_generation = BTRFS_I(inode)->generation;
2631         inode->i_rdev = 0;
2632         rdev = btrfs_inode_rdev(leaf, inode_item);
2633
2634         BTRFS_I(inode)->index_cnt = (u64)-1;
2635         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2636 cache_acl:
2637         /*
2638          * try to precache a NULL acl entry for files that don't have
2639          * any xattrs or acls
2640          */
2641         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
2642                                            btrfs_ino(inode));
2643         if (!maybe_acls)
2644                 cache_no_acl(inode);
2645
2646         btrfs_free_path(path);
2647
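             /* wire up the per-type inode, file and address space operations */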
2648         switch (inode->i_mode & S_IFMT) {
2649         case S_IFREG:
2650                 inode->i_mapping->a_ops = &btrfs_aops;
2651                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2652                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2653                 inode->i_fop = &btrfs_file_operations;
2654                 inode->i_op = &btrfs_file_inode_operations;
2655                 break;
2656         case S_IFDIR:
2657                 inode->i_fop = &btrfs_dir_file_operations;
2658                 if (root == root->fs_info->tree_root)
2659                         inode->i_op = &btrfs_dir_ro_inode_operations;
2660                 else
2661                         inode->i_op = &btrfs_dir_inode_operations;
2662                 break;
2663         case S_IFLNK:
2664                 inode->i_op = &btrfs_symlink_inode_operations;
2665                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2666                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2667                 break;
2668         default:
2669                 inode->i_op = &btrfs_special_inode_operations;
2670                 init_special_inode(inode, inode->i_mode, rdev);
2671                 break;
2672         }
2673
2674         btrfs_update_iflags(inode);
2675         return;
2676
2677 make_bad:
2678         btrfs_free_path(path);
2679         make_bad_inode(inode);
2680 }
2681
2682 /*
2683  * given a leaf and an inode, copy the inode fields into the leaf
2684  */
2685 static void fill_inode_item(struct btrfs_trans_handle *trans,
2686                             struct extent_buffer *leaf,
2687                             struct btrfs_inode_item *item,
2688                             struct inode *inode)
2689 {
2690         btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
2691         btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
2692         btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2693         btrfs_set_inode_mode(leaf, item, inode->i_mode);
2694         btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2695
2696         btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2697                                inode->i_atime.tv_sec);
2698         btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2699                                 inode->i_atime.tv_nsec);
2700
2701         btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2702                                inode->i_mtime.tv_sec);
2703         btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2704                                 inode->i_mtime.tv_nsec);
2705
2706         btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2707                                inode->i_ctime.tv_sec);
2708         btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2709                                 inode->i_ctime.tv_nsec);
2710
2711         btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2712         btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2713         btrfs_set_inode_sequence(leaf, item, inode->i_version);
2714         btrfs_set_inode_transid(leaf, item, trans->transid);
2715         btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2716         btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
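             /*
              * the on-disk block group hint is no longer used for allocation
              * decisions, so it is always written as zero
              */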
2717         btrfs_set_inode_block_group(leaf, item, 0);
2718 }
2719
2720 /*
2721  * copy everything in the in-memory inode into the btree.
2722  */
2723 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2724                                 struct btrfs_root *root, struct inode *inode)
2725 {
2726         struct btrfs_inode_item *inode_item;
2727         struct btrfs_path *path;
2728         struct extent_buffer *leaf;
2729         int ret;
2730
2731         path = btrfs_alloc_path();
2732         if (!path)
2733                 return -ENOMEM;
2734
2735         path->leave_spinning = 1;
2736         ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
2737                                  1);
2738         if (ret) {
2739                 if (ret > 0)
2740                         ret = -ENOENT;
2741                 goto failed;
2742         }
2743
2744         btrfs_unlock_up_safe(path, 1);
2745         leaf = path->nodes[0];
2746         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2747                                     struct btrfs_inode_item);
2748
2749         fill_inode_item(trans, leaf, inode_item, inode);
2750         btrfs_mark_buffer_dirty(leaf);
2751         btrfs_set_inode_last_trans(trans, inode);
2752         ret = 0;
2753 failed:
2754         btrfs_free_path(path);
2755         return ret;
2756 }
2757
2758 /*
2759  * copy everything in the in-memory inode into the btree.
2760  */
2761 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2762                                 struct btrfs_root *root, struct inode *inode)
2763 {
2764         int ret;
2765
2766         /*
2767          * If the inode is a free space inode, we can deadlock during commit
2768          * if we put it into the delayed code.
2769          *
2770          * The data relocation inode should also be directly updated
2771          * without delay
2772          */
2773         if (!btrfs_is_free_space_inode(inode)
2774             && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2775                 btrfs_update_root_times(trans, root);
2776
2777                 ret = btrfs_delayed_update_inode(trans, root, inode);
2778                 if (!ret)
2779                         btrfs_set_inode_last_trans(trans, inode);
2780                 return ret;
2781         }
2782
2783         return btrfs_update_inode_item(trans, root, inode);
2784 }
2785
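     /*
      * variant of btrfs_update_inode that falls back to updating the inode
      * item in place when the delayed path fails with -ENOSPC
      */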
2786 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2787                                          struct btrfs_root *root,
2788                                          struct inode *inode)
2789 {
2790         int ret;
2791
2792         ret = btrfs_update_inode(trans, root, inode);
2793         if (ret == -ENOSPC)
2794                 return btrfs_update_inode_item(trans, root, inode);
2795         return ret;
2796 }
2797
2798 /*
2799  * unlink helper that gets used here in inode.c and in the tree logging
2800  * recovery code.  It removes a link in a directory with a given name, and
2801  * also drops the back refs from the inode to the directory
2802  */
2803 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2804                                 struct btrfs_root *root,
2805                                 struct inode *dir, struct inode *inode,
2806                                 const char *name, int name_len)
2807 {
2808         struct btrfs_path *path;
2809         int ret = 0;
2810         struct extent_buffer *leaf;
2811         struct btrfs_dir_item *di;
2812         struct btrfs_key key;
2813         u64 index;
2814         u64 ino = btrfs_ino(inode);
2815         u64 dir_ino = btrfs_ino(dir);
2816
2817         path = btrfs_alloc_path();
2818         if (!path) {
2819                 ret = -ENOMEM;
2820                 goto out;
2821         }
2822
2823         path->leave_spinning = 1;
2824         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2825                                     name, name_len, -1);
2826         if (IS_ERR(di)) {
2827                 ret = PTR_ERR(di);
2828                 goto err;
2829         }
2830         if (!di) {
2831                 ret = -ENOENT;
2832                 goto err;
2833         }
2834         leaf = path->nodes[0];
2835         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2836         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2837         if (ret)
2838                 goto err;
2839         btrfs_release_path(path);
2840
2841         ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
2842                                   dir_ino, &index);
2843         if (ret) {
2844                 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2845                        "inode %llu parent %llu\n", name_len, name,
2846                        (unsigned long long)ino, (unsigned long long)dir_ino);
2847                 btrfs_abort_transaction(trans, root, ret);
2848                 goto err;
2849         }
2850
2851         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2852         if (ret) {
2853                 btrfs_abort_transaction(trans, root, ret);
2854                 goto err;
2855         }
2856
2857         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2858                                          inode, dir_ino);
2859         if (ret != 0 && ret != -ENOENT) {
2860                 btrfs_abort_transaction(trans, root, ret);
2861                 goto err;
2862         }
2863
2864         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2865                                            dir, index);
2866         if (ret == -ENOENT)
2867                 ret = 0;
2868 err:
2869         btrfs_free_path(path);
2870         if (ret)
2871                 goto out;
2872
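             /*
              * a directory's i_size counts every name twice, once for the
              * dir item and once for the dir index item
              */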
2873         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2874         inode_inc_iversion(inode);
2875         inode_inc_iversion(dir);
2876         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2877         ret = btrfs_update_inode(trans, root, dir);
2878 out:
2879         return ret;
2880 }
2881
2882 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2883                        struct btrfs_root *root,
2884                        struct inode *dir, struct inode *inode,
2885                        const char *name, int name_len)
2886 {
2887         int ret;
2888         ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2889         if (!ret) {
2890                 btrfs_drop_nlink(inode);
2891                 ret = btrfs_update_inode(trans, root, inode);
2892         }
2893         return ret;
2894 }
2895
2896
2897 /* helper to check if there is any shared block in the path */
2898 static int check_path_shared(struct btrfs_root *root,
2899                              struct btrfs_path *path)
2900 {
2901         struct extent_buffer *eb;
2902         int level;
2903         u64 refs = 1;
2904
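             /*
              * walk each node on the path; if any block can be shared and
              * carries more than one reference, the path is shared
              */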
2905         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2906                 int ret;
2907
2908                 if (!path->nodes[level])
2909                         break;
2910                 eb = path->nodes[level];
2911                 if (!btrfs_block_can_be_shared(root, eb))
2912                         continue;
2913                 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2914                                                &refs, NULL);
2915                 if (refs > 1)
2916                         return 1;
2917         }
2918         return 0;
2919 }
2920
2921 /*
2922  * helper to start transaction for unlink and rmdir.
2923  *
2924  * unlink and rmdir are special in btrfs: they do not always free space,
2925  * so in the enospc case we should make sure they will free space before
2926  * allowing them to use the global metadata reservation.
2927  */
2928 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2929                                                        struct dentry *dentry)
2930 {
2931         struct btrfs_trans_handle *trans;
2932         struct btrfs_root *root = BTRFS_I(dir)->root;
2933         struct btrfs_path *path;
2934         struct btrfs_dir_item *di;
2935         struct inode *inode = dentry->d_inode;
2936         u64 index;
2937         int check_link = 1;
2938         int err = -ENOSPC;
2939         int ret;
2940         u64 ino = btrfs_ino(inode);
2941         u64 dir_ino = btrfs_ino(dir);
2942
2943         /*
2944          * 1 for the possible orphan item
2945          * 1 for the dir item
2946          * 1 for the dir index
2947          * 1 for the inode ref
2948          * 1 for the inode ref in the tree log
2949          * 2 for the dir entries in the log
2950          * 1 for the inode
2951          */
2952         trans = btrfs_start_transaction(root, 8);
2953         if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2954                 return trans;
2955
2956         if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2957                 return ERR_PTR(-ENOSPC);
2958
2959         /* check if someone else holds a reference */
2960         if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2961                 return ERR_PTR(-ENOSPC);
2962
2963         if (atomic_read(&inode->i_count) > 2)
2964                 return ERR_PTR(-ENOSPC);
2965
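             /*
              * only one ENOSPC unlink at a time may probe the commit root
              * and borrow from the global reservation
              */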
2966         if (xchg(&root->fs_info->enospc_unlink, 1))
2967                 return ERR_PTR(-ENOSPC);
2968
2969         path = btrfs_alloc_path();
2970         if (!path) {
2971                 root->fs_info->enospc_unlink = 0;
2972                 return ERR_PTR(-ENOMEM);
2973         }
2974
2975         /* 1 for the orphan item */
2976         trans = btrfs_start_transaction(root, 1);
2977         if (IS_ERR(trans)) {
2978                 btrfs_free_path(path);
2979                 root->fs_info->enospc_unlink = 0;
2980                 return trans;
2981         }
2982
2983         path->skip_locking = 1;
2984         path->search_commit_root = 1;
2985
2986         ret = btrfs_lookup_inode(trans, root, path,
2987                                 &BTRFS_I(dir)->location, 0);
2988         if (ret < 0) {
2989                 err = ret;
2990                 goto out;
2991         }
2992         if (ret == 0) {
2993                 if (check_path_shared(root, path))
2994                         goto out;
2995         } else {
2996                 check_link = 0;
2997         }
2998         btrfs_release_path(path);
2999
3000         ret = btrfs_lookup_inode(trans, root, path,
3001                                 &BTRFS_I(inode)->location, 0);
3002         if (ret < 0) {
3003                 err = ret;
3004                 goto out;
3005         }
3006         if (ret == 0) {
3007                 if (check_path_shared(root, path))
3008                         goto out;
3009         } else {
3010                 check_link = 0;
3011         }
3012         btrfs_release_path(path);
3013
3014         if (ret == 0 && S_ISREG(inode->i_mode)) {
3015                 ret = btrfs_lookup_file_extent(trans, root, path,
3016                                                ino, (u64)-1, 0);
3017                 if (ret < 0) {
3018                         err = ret;
3019                         goto out;
3020                 }
3021                 BUG_ON(ret == 0); /* Corruption */
3022                 if (check_path_shared(root, path))
3023                         goto out;
3024                 btrfs_release_path(path);
3025         }
3026
3027         if (!check_link) {
3028                 err = 0;
3029                 goto out;
3030         }
3031
3032         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3033                                 dentry->d_name.name, dentry->d_name.len, 0);
3034         if (IS_ERR(di)) {
3035                 err = PTR_ERR(di);
3036                 goto out;
3037         }
3038         if (di) {
3039                 if (check_path_shared(root, path))
3040                         goto out;
3041         } else {
3042                 err = 0;
3043                 goto out;
3044         }
3045         btrfs_release_path(path);
3046
3047         ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3048                                         dentry->d_name.len, ino, dir_ino, 0,
3049                                         &index);
3050         if (ret) {
3051                 err = ret;
3052                 goto out;
3053         }
3054
3055         if (check_path_shared(root, path))
3056                 goto out;
3057
3058         btrfs_release_path(path);
3059
3060         /*
3061          * This is a commit root search, if we can lookup inode item and other
3062  * related items in the commit root, it means the transaction of
3063  * dir/file creation has been committed, and the dir index item that we
3064  * delayed inserting has also been inserted into the commit root. So
3065          * we needn't worry about the delayed insertion of the dir index item
3066          * here.
3067          */
3068         di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
3069                                 dentry->d_name.name, dentry->d_name.len, 0);
3070         if (IS_ERR(di)) {
3071                 err = PTR_ERR(di);
3072                 goto out;
3073         }
3074         BUG_ON(ret == -ENOENT);
3075         if (check_path_shared(root, path))
3076                 goto out;
3077
3078         err = 0;
3079 out:
3080         btrfs_free_path(path);
3081         /* Migrate the orphan reservation over */
3082         if (!err)
3083                 err = btrfs_block_rsv_migrate(trans->block_rsv,
3084                                 &root->fs_info->global_block_rsv,
3085                                 trans->bytes_reserved);
3086
3087         if (err) {
3088                 btrfs_end_transaction(trans, root);
3089                 root->fs_info->enospc_unlink = 0;
3090                 return ERR_PTR(err);
3091         }
3092
3093         trans->block_rsv = &root->fs_info->global_block_rsv;
3094         return trans;
3095 }
3096
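     /*
      * counterpart of __unlink_start_trans: return the space borrowed from
      * the global reservation and clear the enospc_unlink flag
      */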
3097 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3098                                struct btrfs_root *root)
3099 {
3100         if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3101                 btrfs_block_rsv_release(root, trans->block_rsv,
3102                                         trans->bytes_reserved);
3103                 trans->block_rsv = &root->fs_info->trans_block_rsv;
3104                 BUG_ON(!root->fs_info->enospc_unlink);
3105                 root->fs_info->enospc_unlink = 0;
3106         }
3107         btrfs_end_transaction(trans, root);
3108 }
3109
3110 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3111 {
3112         struct btrfs_root *root = BTRFS_I(dir)->root;
3113         struct btrfs_trans_handle *trans;
3114         struct inode *inode = dentry->d_inode;
3115         int ret;
3116
3117         trans = __unlink_start_trans(dir, dentry);
3118         if (IS_ERR(trans))
3119                 return PTR_ERR(trans);
3120
3121         btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3122
3123         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3124                                  dentry->d_name.name, dentry->d_name.len);
3125         if (ret)
3126                 goto out;
3127
3128         if (inode->i_nlink == 0) {
3129                 ret = btrfs_orphan_add(trans, inode);
3130                 if (ret)
3131                         goto out;
3132         }
3133
3134 out:
3135         __unlink_end_trans(trans, root);
3136         btrfs_btree_balance_dirty(root);
3137         return ret;
3138 }
3139
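     /*
      * unlink a subvolume from a directory.  The dir item points at a root
      * key instead of an inode item, and the back reference lives in the
      * root tree, so this can't share code with __btrfs_unlink_inode
      */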
3140 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3141                         struct btrfs_root *root,
3142                         struct inode *dir, u64 objectid,
3143                         const char *name, int name_len)
3144 {
3145         struct btrfs_path *path;
3146         struct extent_buffer *leaf;
3147         struct btrfs_dir_item *di;
3148         struct btrfs_key key;
3149         u64 index;
3150         int ret;
3151         u64 dir_ino = btrfs_ino(dir);
3152
3153         path = btrfs_alloc_path();
3154         if (!path)
3155                 return -ENOMEM;
3156
3157         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3158                                    name, name_len, -1);
3159         if (IS_ERR_OR_NULL(di)) {
3160                 if (!di)
3161                         ret = -ENOENT;
3162                 else
3163                         ret = PTR_ERR(di);
3164                 goto out;
3165         }
3166
3167         leaf = path->nodes[0];
3168         btrfs_dir_item_key_to_cpu(leaf, di, &key);
3169         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3170         ret = btrfs_delete_one_dir_name(trans, root, path, di);
3171         if (ret) {
3172                 btrfs_abort_transaction(trans, root, ret);
3173                 goto out;
3174         }
3175         btrfs_release_path(path);
3176
3177         ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3178                                  objectid, root->root_key.objectid,
3179                                  dir_ino, &index, name, name_len);
3180         if (ret < 0) {
3181                 if (ret != -ENOENT) {
3182                         btrfs_abort_transaction(trans, root, ret);
3183                         goto out;
3184                 }
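                     /*
                      * the root ref is already gone, recover the dir index
                      * from the dir index item itself
                      */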
3185                 di = btrfs_search_dir_index_item(root, path, dir_ino,
3186                                                  name, name_len);
3187                 if (IS_ERR_OR_NULL(di)) {
3188                         if (!di)
3189                                 ret = -ENOENT;
3190                         else
3191                                 ret = PTR_ERR(di);
3192                         btrfs_abort_transaction(trans, root, ret);
3193                         goto out;
3194                 }
3195
3196                 leaf = path->nodes[0];
3197                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3198                 btrfs_release_path(path);
3199                 index = key.offset;
3200         }
3201         btrfs_release_path(path);
3202
3203         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3204         if (ret) {
3205                 btrfs_abort_transaction(trans, root, ret);
3206                 goto out;
3207         }
3208
3209         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3210         inode_inc_iversion(dir);
3211         dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3212         ret = btrfs_update_inode_fallback(trans, root, dir);
3213         if (ret)
3214                 btrfs_abort_transaction(trans, root, ret);
3215 out:
3216         btrfs_free_path(path);
3217         return ret;
3218 }
3219
3220 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3221 {
3222         struct inode *inode = dentry->d_inode;
3223         int err = 0;
3224         struct btrfs_root *root = BTRFS_I(dir)->root;
3225         struct btrfs_trans_handle *trans;
3226
3227         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3228                 return -ENOTEMPTY;
3229         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3230                 return -EPERM;
3231
3232         trans = __unlink_start_trans(dir, dentry);
3233         if (IS_ERR(trans))
3234                 return PTR_ERR(trans);
3235
3236         if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3237                 err = btrfs_unlink_subvol(trans, root, dir,
3238                                           BTRFS_I(inode)->location.objectid,
3239                                           dentry->d_name.name,
3240                                           dentry->d_name.len);
3241                 goto out;
3242         }
3243
3244         err = btrfs_orphan_add(trans, inode);
3245         if (err)
3246                 goto out;
3247
3248         /* now the directory is empty */
3249         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3250                                  dentry->d_name.name, dentry->d_name.len);
3251         if (!err)
3252                 btrfs_i_size_write(inode, 0);
3253 out:
3254         __unlink_end_trans(trans, root);
3255         btrfs_btree_balance_dirty(root);
3256
3257         return err;
3258 }
3259
3260 /*
3261  * this can truncate away extent items, csum items and directory items.
3262  * It starts at a high offset and removes keys until it can't find
3263  * any higher than new_size
3264  *
3265  * csum items that cross the new i_size are truncated to the new size
3266  * as well.
3267  *
3268  * min_type is the minimum key type to truncate down to.  If set to 0, this
3269  * will kill all the items on this inode, including the INODE_ITEM_KEY.
3270  */
3271 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3272                                struct btrfs_root *root,
3273                                struct inode *inode,
3274                                u64 new_size, u32 min_type)
3275 {
3276         struct btrfs_path *path;
3277         struct extent_buffer *leaf;
3278         struct btrfs_file_extent_item *fi;
3279         struct btrfs_key key;
3280         struct btrfs_key found_key;
3281         u64 extent_start = 0;
3282         u64 extent_num_bytes = 0;
3283         u64 extent_offset = 0;
3284         u64 item_end = 0;
3285         u64 mask = root->sectorsize - 1;
3286         u32 found_type = (u8)-1;
3287         int found_extent;
3288         int del_item;
3289         int pending_del_nr = 0;
3290         int pending_del_slot = 0;
3291         int extent_type = -1;
3292         int ret;
3293         int err = 0;
3294         u64 ino = btrfs_ino(inode);
3295
3296         BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3297
3298         path = btrfs_alloc_path();
3299         if (!path)
3300                 return -ENOMEM;
3301         path->reada = -1;
3302
3303         /*
3304          * We want to drop from the next block forward in case this new size is
3305          * not block aligned since we will be keeping the last block of the
3306          * extent just the way it is.
3307          */
3308         if (root->ref_cows || root == root->fs_info->tree_root)
3309                 btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
3310
3311         /*
3312          * This function is also used to drop the items in the log tree before
3313          * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3314  * it is used to drop the logged items. So we shouldn't kill the delayed
3315          * items.
3316          */
3317         if (min_type == 0 && root == BTRFS_I(inode)->root)
3318                 btrfs_kill_delayed_inode_items(inode);
3319
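             /* start from the highest possible key and walk backwards */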
3320         key.objectid = ino;
3321         key.offset = (u64)-1;
3322         key.type = (u8)-1;
3323
3324 search_again:
3325         path->leave_spinning = 1;
3326         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3327         if (ret < 0) {
3328                 err = ret;
3329                 goto out;
3330         }
3331
3332         if (ret > 0) {
3333                 /* there are no items in the tree for us to truncate, we're
3334                  * done
3335                  */
3336                 if (path->slots[0] == 0)
3337                         goto out;
3338                 path->slots[0]--;
3339         }
3340
3341         while (1) {
3342                 fi = NULL;
3343                 leaf = path->nodes[0];
3344                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3345                 found_type = btrfs_key_type(&found_key);
3346
3347                 if (found_key.objectid != ino)
3348                         break;
3349
3350                 if (found_type < min_type)
3351                         break;
3352
3353                 item_end = found_key.offset;
3354                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
3355                         fi = btrfs_item_ptr(leaf, path->slots[0],
3356                                             struct btrfs_file_extent_item);
3357                         extent_type = btrfs_file_extent_type(leaf, fi);
3358                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3359                                 item_end +=
3360                                     btrfs_file_extent_num_bytes(leaf, fi);
3361                         } else {
3362                                 item_end += btrfs_file_extent_inline_len(leaf,
3363                                                                          fi);
3364                         }
3365                         item_end--;
3366                 }
3367                 if (found_type > min_type) {
3368                         del_item = 1;
3369                 } else {
3370                         if (item_end < new_size)
3371                                 break;
3372                         if (found_key.offset >= new_size)
3373                                 del_item = 1;
3374                         else
3375                                 del_item = 0;
3376                 }
3377                 found_extent = 0;
3378                 /* FIXME, shrink the extent if the ref count is only 1 */
3379                 if (found_type != BTRFS_EXTENT_DATA_KEY)
3380                         goto delete;
3381
3382                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3383                         u64 num_dec;
3384                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3385                         if (!del_item) {
3386                                 u64 orig_num_bytes =
3387                                         btrfs_file_extent_num_bytes(leaf, fi);
3388                                 extent_num_bytes = new_size -
3389                                         found_key.offset + root->sectorsize - 1;
3390                                 extent_num_bytes = extent_num_bytes &
3391                                         ~((u64)root->sectorsize - 1);
3392                                 btrfs_set_file_extent_num_bytes(leaf, fi,
3393                                                          extent_num_bytes);
3394                                 num_dec = (orig_num_bytes -
3395                                            extent_num_bytes);
3396                                 if (root->ref_cows && extent_start != 0)
3397                                         inode_sub_bytes(inode, num_dec);
3398                                 btrfs_mark_buffer_dirty(leaf);
3399                         } else {
3400                                 extent_num_bytes =
3401                                         btrfs_file_extent_disk_num_bytes(leaf,
3402                                                                          fi);
3403                                 extent_offset = found_key.offset -
3404                                         btrfs_file_extent_offset(leaf, fi);
3405
3406                                 /* FIXME blocksize != 4096 */
3407                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
3408                                 if (extent_start != 0) {
3409                                         found_extent = 1;
3410                                         if (root->ref_cows)
3411                                                 inode_sub_bytes(inode, num_dec);
3412                                 }
3413                         }
3414                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3415                         /*
3416                          * we can't truncate inline items that have had
3417                          * special encodings
3418                          */
3419                         if (!del_item &&
3420                             btrfs_file_extent_compression(leaf, fi) == 0 &&
3421                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
3422                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
3423                                 u32 size = new_size - found_key.offset;
3424
3425                                 if (root->ref_cows) {
3426                                         inode_sub_bytes(inode, item_end + 1 -
3427                                                         new_size);
3428                                 }
3429                                 size =
3430                                     btrfs_file_extent_calc_inline_size(size);
3431                                 btrfs_truncate_item(trans, root, path,
3432                                                     size, 1);
3433                         } else if (root->ref_cows) {
3434                                 inode_sub_bytes(inode, item_end + 1 -
3435                                                 found_key.offset);
3436                         }
3437                 }
3438 delete:
3439                 if (del_item) {
3440                         if (!pending_del_nr) {
3441                                 /* no pending yet, add ourselves */
3442                                 pending_del_slot = path->slots[0];
3443                                 pending_del_nr = 1;
3444                         } else if (pending_del_nr &&
3445                                    path->slots[0] + 1 == pending_del_slot) {
3446                                 /* hop on the pending chunk */
3447                                 pending_del_nr++;
3448                                 pending_del_slot = path->slots[0];
3449                         } else {
3450                                 BUG();
3451                         }
3452                 } else {
3453                         break;
3454                 }
3455                 if (found_extent && (root->ref_cows ||
3456                                      root == root->fs_info->tree_root)) {
3457                         btrfs_set_path_blocking(path);
3458                         ret = btrfs_free_extent(trans, root, extent_start,
3459                                                 extent_num_bytes, 0,
3460                                                 btrfs_header_owner(leaf),
3461                                                 ino, extent_offset, 0);
3462                         BUG_ON(ret);
3463                 }
3464
3465                 if (found_type == BTRFS_INODE_ITEM_KEY)
3466                         break;
3467
3468                 if (path->slots[0] == 0 ||
3469                     path->slots[0] != pending_del_slot) {
3470                         if (pending_del_nr) {
3471                                 ret = btrfs_del_items(trans, root, path,
3472                                                 pending_del_slot,
3473                                                 pending_del_nr);
3474                                 if (ret) {
3475                                         btrfs_abort_transaction(trans,
3476                                                                 root, ret);
3477                                         goto error;
3478                                 }
3479                                 pending_del_nr = 0;
3480                         }
3481                         btrfs_release_path(path);
3482                         goto search_again;
3483                 } else {
3484                         path->slots[0]--;
3485                 }
3486         }
3487 out:
3488         if (pending_del_nr) {
3489                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3490                                       pending_del_nr);
3491                 if (ret)
3492                         btrfs_abort_transaction(trans, root, ret);
3493         }
3494 error:
3495         btrfs_free_path(path);
3496         return err;
3497 }
3498
3499 /*
3500  * btrfs_truncate_page - read, zero a chunk and write a page
3501  * @inode - inode that we're zeroing
3502  * @from - the offset to start zeroing
3503  * @len - the length to zero, 0 to zero the rest of the page from the
3504  *      offset
3505  * @front - zero up to the offset instead of from the offset on
3506  *
3507  * This will find the page for the "from" offset and cow the page and zero the
3508  * part we want to zero.  This is used with truncate and hole punching.
3509  */
3510 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3511                         int front)
3512 {
3513         struct address_space *mapping = inode->i_mapping;
3514         struct btrfs_root *root = BTRFS_I(inode)->root;
3515         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3516         struct btrfs_ordered_extent *ordered;
3517         struct extent_state *cached_state = NULL;
3518         char *kaddr;
3519         u32 blocksize = root->sectorsize;
3520         pgoff_t index = from >> PAGE_CACHE_SHIFT;
3521         unsigned offset = from & (PAGE_CACHE_SIZE-1);
3522         struct page *page;
3523         gfp_t mask = btrfs_alloc_write_mask(mapping);
3524         int ret = 0;
3525         u64 page_start;
3526         u64 page_end;
3527
3528         if ((offset & (blocksize - 1)) == 0 &&
3529             (!len || ((len & (blocksize - 1)) == 0)))
3530                 goto out;
3531         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3532         if (ret)
3533                 goto out;
3534
3535 again:
3536         page = find_or_create_page(mapping, index, mask);
3537         if (!page) {
3538                 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3539                 ret = -ENOMEM;
3540                 goto out;
3541         }
3542
3543         page_start = page_offset(page);
3544         page_end = page_start + PAGE_CACHE_SIZE - 1;
3545
3546         if (!PageUptodate(page)) {
3547                 ret = btrfs_readpage(NULL, page);
3548                 lock_page(page);
3549                 if (page->mapping != mapping) {
3550                         unlock_page(page);
3551                         page_cache_release(page);
3552                         goto again;
3553                 }
3554                 if (!PageUptodate(page)) {
3555                         ret = -EIO;
3556                         goto out_unlock;
3557                 }
3558         }
3559         wait_on_page_writeback(page);
3560
3561         lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
3562         set_page_extent_mapped(page);
3563
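             /*
              * if an ordered extent is still pending for this page, wait for
              * it to finish and retry so we don't zero data that is about
              * to be written back
              */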
3564         ordered = btrfs_lookup_ordered_extent(inode, page_start);
3565         if (ordered) {
3566                 unlock_extent_cached(io_tree, page_start, page_end,
3567                                      &cached_state, GFP_NOFS);
3568                 unlock_page(page);
3569                 page_cache_release(page);
3570                 btrfs_start_ordered_extent(inode, ordered, 1);
3571                 btrfs_put_ordered_extent(ordered);
3572                 goto again;
3573         }
3574
3575         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3576                           EXTENT_DIRTY | EXTENT_DELALLOC |
3577                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
3578                           0, 0, &cached_state, GFP_NOFS);
3579
3580         ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3581                                         &cached_state);
3582         if (ret) {
3583                 unlock_extent_cached(io_tree, page_start, page_end,
3584                                      &cached_state, GFP_NOFS);
3585                 goto out_unlock;
3586         }
3587
3588         if (offset != PAGE_CACHE_SIZE) {
3589                 if (!len)
3590                         len = PAGE_CACHE_SIZE - offset;
3591                 kaddr = kmap(page);
3592                 if (front)
3593                         memset(kaddr, 0, offset);
3594                 else
3595                         memset(kaddr + offset, 0, len);
3596                 flush_dcache_page(page);
3597                 kunmap(page);
3598         }
3599         ClearPageChecked(page);
3600         set_page_dirty(page);
3601         unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3602                              GFP_NOFS);
3603
3604 out_unlock:
3605         if (ret)
3606                 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3607         unlock_page(page);
3608         page_cache_release(page);
3609 out:
3610         return ret;
3611 }
3612
3613 /*
3614  * This function puts in dummy file extents for the area we're creating a hole
3615  * for.  So if we are truncating this file to a larger size we need to insert
3616  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
3617  * the range between oldsize and size
3618  */
3619 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3620 {
3621         struct btrfs_trans_handle *trans;
3622         struct btrfs_root *root = BTRFS_I(inode)->root;
3623         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3624         struct extent_map *em = NULL;
3625         struct extent_state *cached_state = NULL;
3626         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3627         u64 mask = root->sectorsize - 1;
3628         u64 hole_start = (oldsize + mask) & ~mask;
3629         u64 block_end = (size + mask) & ~mask;
3630         u64 last_byte;
3631         u64 cur_offset;
3632         u64 hole_size;
3633         int err = 0;
3634
3635         if (size <= hole_start)
3636                 return 0;
3637
3638         while (1) {
3639                 struct btrfs_ordered_extent *ordered;
3640                 btrfs_wait_ordered_range(inode, hole_start,
3641                                          block_end - hole_start);
3642                 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3643                                  &cached_state);
3644                 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3645                 if (!ordered)
3646                         break;
3647                 unlock_extent_cached(io_tree, hole_start, block_end - 1,
3648                                      &cached_state, GFP_NOFS);
3649                 btrfs_put_ordered_extent(ordered);
3650         }
3651
3652         cur_offset = hole_start;
3653         while (1) {
3654                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
3655                                 block_end - cur_offset, 0);
3656                 if (IS_ERR(em)) {
3657                         err = PTR_ERR(em);
3658                         break;
3659                 }
3660                 last_byte = min(extent_map_end(em), block_end);
3661                 last_byte = (last_byte + mask) & ~mask;
3662                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3663                         struct extent_map *hole_em;
3664                         hole_size = last_byte - cur_offset;
3665
3666                         trans = btrfs_start_transaction(root, 3);
3667                         if (IS_ERR(trans)) {
3668                                 err = PTR_ERR(trans);
3669                                 break;
3670                         }
3671
3672                         err = btrfs_drop_extents(trans, root, inode,
3673                                                  cur_offset,
3674                                                  cur_offset + hole_size, 1);
3675                         if (err) {
3676                                 btrfs_abort_transaction(trans, root, err);
3677                                 btrfs_end_transaction(trans, root);
3678                                 break;
3679                         }
3680
3681                         err = btrfs_insert_file_extent(trans, root,
3682                                         btrfs_ino(inode), cur_offset, 0,
3683                                         0, hole_size, 0, hole_size,
3684                                         0, 0, 0);
3685                         if (err) {
3686                                 btrfs_abort_transaction(trans, root, err);
3687                                 btrfs_end_transaction(trans, root);
3688                                 break;
3689                         }
3690
3691                         btrfs_drop_extent_cache(inode, cur_offset,
3692                                                 cur_offset + hole_size - 1, 0);
3693                         hole_em = alloc_extent_map();
3694                         if (!hole_em) {
3695                                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3696                                         &BTRFS_I(inode)->runtime_flags);
3697                                 goto next;
3698                         }
3699                         hole_em->start = cur_offset;
3700                         hole_em->len = hole_size;
3701                         hole_em->orig_start = cur_offset;
3702
3703                         hole_em->block_start = EXTENT_MAP_HOLE;
3704                         hole_em->block_len = 0;
3705                         hole_em->orig_block_len = 0;
3706                         hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3707                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
3708                         hole_em->generation = trans->transid;
3709
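                             /*
                              * insert the hole extent map; -EEXIST means a
                              * racing reader repopulated the range, so drop
                              * the cached extents and try again
                              */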
3710                         while (1) {
3711                                 write_lock(&em_tree->lock);
3712                                 err = add_extent_mapping(em_tree, hole_em);
3713                                 if (!err)
3714                                         list_move(&hole_em->list,
3715                                                   &em_tree->modified_extents);
3716                                 write_unlock(&em_tree->lock);
3717                                 if (err != -EEXIST)
3718                                         break;
3719                                 btrfs_drop_extent_cache(inode, cur_offset,
3720                                                         cur_offset +
3721                                                         hole_size - 1, 0);
3722                         }
3723                         free_extent_map(hole_em);
3724 next:
3725                         btrfs_update_inode(trans, root, inode);
3726                         btrfs_end_transaction(trans, root);
3727                 }
3728                 free_extent_map(em);
3729                 em = NULL;
3730                 cur_offset = last_byte;
3731                 if (cur_offset >= block_end)
3732                         break;
3733         }
3734
3735         free_extent_map(em);
3736         unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3737                              GFP_NOFS);
3738         return err;
3739 }
3740
3741 static int btrfs_setsize(struct inode *inode, loff_t newsize)
3742 {
3743         struct btrfs_root *root = BTRFS_I(inode)->root;
3744         struct btrfs_trans_handle *trans;
3745         loff_t oldsize = i_size_read(inode);
3746         int ret;
3747
3748         if (newsize == oldsize)
3749                 return 0;
3750
3751         if (newsize > oldsize) {
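                     /*
                      * growing the file: make sure the region between the
                      * old and new size is backed by hole extents before
                      * publishing the new i_size
                      */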
3752                 truncate_pagecache(inode, oldsize, newsize);
3753                 ret = btrfs_cont_expand(inode, oldsize, newsize);
3754                 if (ret)
3755                         return ret;
3756
3757                 trans = btrfs_start_transaction(root, 1);
3758                 if (IS_ERR(trans))
3759                         return PTR_ERR(trans);
3760
3761                 i_size_write(inode, newsize);
3762                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3763                 ret = btrfs_update_inode(trans, root, inode);
3764                 btrfs_end_transaction(trans, root);
3765         } else {
3766
3767                 /*
3768                  * We're truncating a file that used to have good data down to
3769                  * zero. Make sure it gets into the ordered flush list so that
3770                  * any new writes get down to disk quickly.
3771                  */
3772                 if (newsize == 0)
3773                         set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3774                                 &BTRFS_I(inode)->runtime_flags);
3775
3776                 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3777                 truncate_setsize(inode, newsize);
3778                 ret = btrfs_truncate(inode);
3779         }
3780
3781         return ret;
3782 }
3783
3784 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3785 {
3786         struct inode *inode = dentry->d_inode;
3787         struct btrfs_root *root = BTRFS_I(inode)->root;
3788         int err;
3789
3790         if (btrfs_root_readonly(root))
3791                 return -EROFS;
3792
3793         err = inode_change_ok(inode, attr);
3794         if (err)
3795                 return err;
3796
3797         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3798                 err = btrfs_setsize(inode, attr->ia_size);
3799                 if (err)
3800                         return err;
3801         }
3802
3803         if (attr->ia_valid) {
3804                 setattr_copy(inode, attr);
3805                 inode_inc_iversion(inode);
3806                 err = btrfs_dirty_inode(inode);
3807
3808                 if (!err && attr->ia_valid & ATTR_MODE)
3809                         err = btrfs_acl_chmod(inode);
3810         }
3811
3812         return err;
3813 }
3814
3815 void btrfs_evict_inode(struct inode *inode)
3816 {
3817         struct btrfs_trans_handle *trans;
3818         struct btrfs_root *root = BTRFS_I(inode)->root;
3819         struct btrfs_block_rsv *rsv, *global_rsv;
3820         u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3821         int ret;
3822
3823         trace_btrfs_inode_evict(inode);
3824
3825         truncate_inode_pages(&inode->i_data, 0);
3826         if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3827                                btrfs_is_free_space_inode(inode)))
3828                 goto no_delete;
3829
3830         if (is_bad_inode(inode)) {
3831                 btrfs_orphan_del(NULL, inode);
3832                 goto no_delete;
3833         }
3834         /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3835         btrfs_wait_ordered_range(inode, 0, (u64)-1);
3836
3837         if (root->fs_info->log_root_recovering) {
3838                 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3839                                  &BTRFS_I(inode)->runtime_flags));
3840                 goto no_delete;
3841         }
3842
3843         if (inode->i_nlink > 0) {
3844                 BUG_ON(btrfs_root_refs(&root->root_item) != 0);
3845                 goto no_delete;
3846         }
3847
3848         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3849         if (!rsv) {
3850                 btrfs_orphan_del(NULL, inode);
3851                 goto no_delete;
3852         }
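             /*
              * reserve just enough for the truncate items and fail fast so
              * we can fall back to stealing from the global reserve
              */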
3853         rsv->size = min_size;
3854         rsv->failfast = 1;
3855         global_rsv = &root->fs_info->global_block_rsv;
3856
3857         btrfs_i_size_write(inode, 0);
3858
3859         /*
3860          * This is a bit simpler than btrfs_truncate since we've already
3861          * reserved our space for our orphan item in the unlink, so we just
3862          * need to reserve some slack space in case we add bytes and update
3863          * inode item when doing the truncate.
3864          */
3865         while (1) {
3866                 ret = btrfs_block_rsv_refill(root, rsv, min_size,
3867                                              BTRFS_RESERVE_FLUSH_LIMIT);
3868
3869                 /*
3870                  * Try and steal from the global reserve since we will
3871                  * likely not use this space anyway; we want to try as
3872                  * hard as possible to get this to work.
3873                  */
3874                 if (ret)
3875                         ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3876
3877                 if (ret) {
3878                         printk(KERN_WARNING "Could not get space for a "
3879                                "delete, will truncate on mount %d\n", ret);
3880                         btrfs_orphan_del(NULL, inode);
3881                         btrfs_free_block_rsv(root, rsv);
3882                         goto no_delete;
3883                 }
3884
3885                 trans = btrfs_start_transaction_lflush(root, 1);
3886                 if (IS_ERR(trans)) {
3887                         btrfs_orphan_del(NULL, inode);
3888                         btrfs_free_block_rsv(root, rsv);
3889                         goto no_delete;
3890                 }
3891
3892                 trans->block_rsv = rsv;
3893
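                     /*
                      * with failfast set the truncate bails out with -ENOSPC
                      * after each batch; push the inode item out, open a
                      * fresh transaction and continue
                      */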
3894                 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3895                 if (ret != -ENOSPC)
3896                         break;
3897
3898                 trans->block_rsv = &root->fs_info->trans_block_rsv;
3899                 ret = btrfs_update_inode(trans, root, inode);
3900                 BUG_ON(ret);
3901
3902                 btrfs_end_transaction(trans, root);
3903                 trans = NULL;
3904                 btrfs_btree_balance_dirty(root);
3905         }
3906
3907         btrfs_free_block_rsv(root, rsv);
3908
3909         if (ret == 0) {
3910                 trans->block_rsv = root->orphan_block_rsv;
3911                 ret = btrfs_orphan_del(trans, inode);
3912                 BUG_ON(ret);
3913         }
3914
3915         trans->block_rsv = &root->fs_info->trans_block_rsv;
3916         if (!(root == root->fs_info->tree_root ||
3917               root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3918                 btrfs_return_ino(root, btrfs_ino(inode));
3919
3920         btrfs_end_transaction(trans, root);
3921         btrfs_btree_balance_dirty(root);
3922 no_delete:
3923         clear_inode(inode);
3924         return;
3925 }
3926
3927 /*
3928  * this returns the key found in the dir entry in the location pointer.
3929  * If no dir entries were found, location->objectid is 0.
3930  */
3931 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3932                                struct btrfs_key *location)
3933 {
3934         const char *name = dentry->d_name.name;
3935         int namelen = dentry->d_name.len;
3936         struct btrfs_dir_item *di;
3937         struct btrfs_path *path;
3938         struct btrfs_root *root = BTRFS_I(dir)->root;
3939         int ret = 0;
3940
3941         path = btrfs_alloc_path();
3942         if (!path)
3943                 return -ENOMEM;
3944
3945         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3946                                     namelen, 0);
3947         if (IS_ERR(di))
3948                 ret = PTR_ERR(di);
3949
3950         if (IS_ERR_OR_NULL(di))
3951                 goto out_err;
3952
3953         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
3954 out:
3955         btrfs_free_path(path);
3956         return ret;
3957 out_err:
3958         location->objectid = 0;
3959         goto out;
3960 }
3961
3962 /*
3963  * when we hit a tree root in a directory, the btrfs part of the inode
3964  * needs to be changed to reflect the root directory of the tree root.  This
3965  * is kind of like crossing a mount point.
3966  */
3967 static int fixup_tree_root_location(struct btrfs_root *root,
3968                                     struct inode *dir,
3969                                     struct dentry *dentry,
3970                                     struct btrfs_key *location,
3971                                     struct btrfs_root **sub_root)
3972 {
3973         struct btrfs_path *path;
3974         struct btrfs_root *new_root;
3975         struct btrfs_root_ref *ref;
3976         struct extent_buffer *leaf;
3977         int ret;
3978         int err = 0;
3979
3980         path = btrfs_alloc_path();
3981         if (!path) {
3982                 err = -ENOMEM;
3983                 goto out;
3984         }
3985
3986         err = -ENOENT;
3987         ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
3988                                   BTRFS_I(dir)->root->root_key.objectid,
3989                                   location->objectid);
3990         if (ret) {
3991                 if (ret < 0)
3992                         err = ret;
3993                 goto out;
3994         }
3995
3996         leaf = path->nodes[0];
3997         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3998         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
3999             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4000                 goto out;
4001
4002         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4003                                    (unsigned long)(ref + 1),
4004                                    dentry->d_name.len);
4005         if (ret)
4006                 goto out;
4007
4008         btrfs_release_path(path);
4009
4010         new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4011         if (IS_ERR(new_root)) {
4012                 err = PTR_ERR(new_root);
4013                 goto out;
4014         }
4015
4016         if (btrfs_root_refs(&new_root->root_item) == 0) {
4017                 err = -ENOENT;
4018                 goto out;
4019         }
4020
4021         *sub_root = new_root;
4022         location->objectid = btrfs_root_dirid(&new_root->root_item);
4023         location->type = BTRFS_INODE_ITEM_KEY;
4024         location->offset = 0;
4025         err = 0;
4026 out:
4027         btrfs_free_path(path);
4028         return err;
4029 }
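
/*
 * For example: the dir item for a snapshot entry carries a key of type
 * BTRFS_ROOT_ITEM_KEY naming the snapshot's tree.  On success this helper
 * rewrites @location to (btrfs_root_dirid(), BTRFS_INODE_ITEM_KEY, 0) and
 * points *sub_root at the snapshot's root, so the lookup continues inside
 * the subvolume.
 */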
4030
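/*
 * Add the inode to the per-root rb-tree of in-memory inodes, keyed by
 * inode number.  If a stale entry with the same ino is found (an inode
 * on its way to being freed), it is evicted and the insert retried.
 */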
4031 static void inode_tree_add(struct inode *inode)
4032 {
4033         struct btrfs_root *root = BTRFS_I(inode)->root;
4034         struct btrfs_inode *entry;
4035         struct rb_node **p;
4036         struct rb_node *parent;
4037         u64 ino = btrfs_ino(inode);
4038 again:
4039         p = &root->inode_tree.rb_node;
4040         parent = NULL;
4041
4042         if (inode_unhashed(inode))
4043                 return;
4044
4045         spin_lock(&root->inode_lock);
4046         while (*p) {
4047                 parent = *p;
4048                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
4049
4050                 if (ino < btrfs_ino(&entry->vfs_inode))
4051                         p = &parent->rb_left;
4052                 else if (ino > btrfs_ino(&entry->vfs_inode))
4053                         p = &parent->rb_right;
4054                 else {
4055                         WARN_ON(!(entry->vfs_inode.i_state &
4056                                   (I_WILL_FREE | I_FREEING)));
4057                         rb_erase(parent, &root->inode_tree);
4058                         RB_CLEAR_NODE(parent);
4059                         spin_unlock(&root->inode_lock);
4060                         goto again;
4061                 }
4062         }
4063         rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
4064         rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4065         spin_unlock(&root->inode_lock);
4066 }
4067
4068 static void inode_tree_del(struct inode *inode)
4069 {
4070         struct btrfs_root *root = BTRFS_I(inode)->root;
4071         int empty = 0;
4072
4073         spin_lock(&root->inode_lock);
4074         if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4075                 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4076                 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4077                 empty = RB_EMPTY_ROOT(&root->inode_tree);
4078         }
4079         spin_unlock(&root->inode_lock);
4080
4081         /*
4082          * Free space cache has inodes in the tree root, but the tree root has a
4083          * root_refs of 0, so this could end up dropping the tree root as a
4084          * snapshot, so we need the extra !root->fs_info->tree_root check to
4085          * make sure we don't drop it.
4086          */
4087         if (empty && btrfs_root_refs(&root->root_item) == 0 &&
4088             root != root->fs_info->tree_root) {
4089                 synchronize_srcu(&root->fs_info->subvol_srcu);
4090                 spin_lock(&root->inode_lock);
4091                 empty = RB_EMPTY_ROOT(&root->inode_tree);
4092                 spin_unlock(&root->inode_lock);
4093                 if (empty)
4094                         btrfs_add_dead_root(root);
4095         }
4096 }
4097
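/*
 * Evict all cached inodes of a dead root (root_refs == 0), walking the
 * per-root rb-tree in inode number order and dropping each reference.
 */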
4098 void btrfs_invalidate_inodes(struct btrfs_root *root)
4099 {
4100         struct rb_node *node;
4101         struct rb_node *prev;
4102         struct btrfs_inode *entry;
4103         struct inode *inode;
4104         u64 objectid = 0;
4105
4106         WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4107
4108         spin_lock(&root->inode_lock);
4109 again:
4110         node = root->inode_tree.rb_node;
4111         prev = NULL;
4112         while (node) {
4113                 prev = node;
4114                 entry = rb_entry(node, struct btrfs_inode, rb_node);
4115
4116                 if (objectid < btrfs_ino(&entry->vfs_inode))
4117                         node = node->rb_left;
4118                 else if (objectid > btrfs_ino(&entry->vfs_inode))
4119                         node = node->rb_right;
4120                 else
4121                         break;
4122         }
4123         if (!node) {
4124                 while (prev) {
4125                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
4126                         if (objectid <= btrfs_ino(&entry->vfs_inode)) {
4127                                 node = prev;
4128                                 break;
4129                         }
4130                         prev = rb_next(prev);
4131                 }
4132         }
4133         while (node) {
4134                 entry = rb_entry(node, struct btrfs_inode, rb_node);
4135                 objectid = btrfs_ino(&entry->vfs_inode) + 1;
4136                 inode = igrab(&entry->vfs_inode);
4137                 if (inode) {
4138                         spin_unlock(&root->inode_lock);
4139                         if (atomic_read(&inode->i_count) > 1)
4140                                 d_prune_aliases(inode);
4141                         /*
4142                          * btrfs_drop_inode will have it removed from
4143                          * the inode cache when its usage count
4144                          * hits zero.
4145                          */
4146                         iput(inode);
4147                         cond_resched();
4148                         spin_lock(&root->inode_lock);
4149                         goto again;
4150                 }
4151
4152                 if (cond_resched_lock(&root->inode_lock))
4153                         goto again;
4154
4155                 node = rb_next(node);
4156         }
4157         spin_unlock(&root->inode_lock);
4158 }
4159
4160 static int btrfs_init_locked_inode(struct inode *inode, void *p)
4161 {
4162         struct btrfs_iget_args *args = p;
4163         inode->i_ino = args->ino;
4164         BTRFS_I(inode)->root = args->root;
4165         return 0;
4166 }
4167
4168 static int btrfs_find_actor(struct inode *inode, void *opaque)
4169 {
4170         struct btrfs_iget_args *args = opaque;
4171         return args->ino == btrfs_ino(inode) &&
4172                 args->root == BTRFS_I(inode)->root;
4173 }
4174
4175 static struct inode *btrfs_iget_locked(struct super_block *s,
4176                                        u64 objectid,
4177                                        struct btrfs_root *root)
4178 {
4179         struct inode *inode;
4180         struct btrfs_iget_args args;
4181         args.ino = objectid;
4182         args.root = root;
4183
4184         inode = iget5_locked(s, objectid, btrfs_find_actor,
4185                              btrfs_init_locked_inode,
4186                              (void *)&args);
4187         return inode;
4188 }
4189
4190 /* Get an inode object given its location and corresponding root.
4191  * Sets *new to 1 if the inode was read from disk
4192  */
4193 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4194                          struct btrfs_root *root, int *new)
4195 {
4196         struct inode *inode;
4197
4198         inode = btrfs_iget_locked(s, location->objectid, root);
4199         if (!inode)
4200                 return ERR_PTR(-ENOMEM);
4201
4202         if (inode->i_state & I_NEW) {
4203                 BTRFS_I(inode)->root = root;
4204                 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4205                 btrfs_read_locked_inode(inode);
4206                 if (!is_bad_inode(inode)) {
4207                         inode_tree_add(inode);
4208                         unlock_new_inode(inode);
4209                         if (new)
4210                                 *new = 1;
4211                 } else {
4212                         unlock_new_inode(inode);
4213                         iput(inode);
4214                         inode = ERR_PTR(-ESTALE);
4215                 }
4216         }
4217
4218         return inode;
4219 }
4220
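/*
 * Build an in-memory only "dummy" directory inode, used when the root
 * of a subvolume cannot be resolved (see the -ENOENT handling in
 * btrfs_lookup_dentry() below).
 */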
4221 static struct inode *new_simple_dir(struct super_block *s,
4222                                     struct btrfs_key *key,
4223                                     struct btrfs_root *root)
4224 {
4225         struct inode *inode = new_inode(s);
4226
4227         if (!inode)
4228                 return ERR_PTR(-ENOMEM);
4229
4230         BTRFS_I(inode)->root = root;
4231         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4232         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4233
4234         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4235         inode->i_op = &btrfs_dir_ro_inode_operations;
4236         inode->i_fop = &simple_dir_operations;
4237         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
4238         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4239
4240         return inode;
4241 }
4242
4243 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4244 {
4245         struct inode *inode;
4246         struct btrfs_root *root = BTRFS_I(dir)->root;
4247         struct btrfs_root *sub_root = root;
4248         struct btrfs_key location;
4249         int index;
4250         int ret = 0;
4251
4252         if (dentry->d_name.len > BTRFS_NAME_LEN)
4253                 return ERR_PTR(-ENAMETOOLONG);
4254
4255         if (unlikely(d_need_lookup(dentry))) {
4256                 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4257                 kfree(dentry->d_fsdata);
4258                 dentry->d_fsdata = NULL;
4259                 /* This thing is hashed, drop it for now */
4260                 d_drop(dentry);
4261         } else {
4262                 ret = btrfs_inode_by_name(dir, dentry, &location);
4263         }
4264
4265         if (ret < 0)
4266                 return ERR_PTR(ret);
4267
4268         if (location.objectid == 0)
4269                 return NULL;
4270
4271         if (location.type == BTRFS_INODE_ITEM_KEY) {
4272                 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
4273                 return inode;
4274         }
4275
4276         BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
4277
4278         index = srcu_read_lock(&root->fs_info->subvol_srcu);
4279         ret = fixup_tree_root_location(root, dir, dentry,
4280                                        &location, &sub_root);
4281         if (ret < 0) {
4282                 if (ret != -ENOENT)
4283                         inode = ERR_PTR(ret);
4284                 else
4285                         inode = new_simple_dir(dir->i_sb, &location, sub_root);
4286         } else {
4287                 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
4288         }
4289         srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4290
4291         if (!IS_ERR(inode) && root != sub_root) {
4292                 down_read(&root->fs_info->cleanup_work_sem);
4293                 if (!(inode->i_sb->s_flags & MS_RDONLY))
4294                         ret = btrfs_orphan_cleanup(sub_root);
4295                 up_read(&root->fs_info->cleanup_work_sem);
4296                 if (ret)
4297                         inode = ERR_PTR(ret);
4298         }
4299
4300         return inode;
4301 }
4302
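/*
 * Tell the dcache to delete a dentry right away instead of caching it:
 * return 1 for inodes that belong to a dead root or to a dummy subvolume
 * directory, 0 otherwise.
 */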
4303 static int btrfs_dentry_delete(const struct dentry *dentry)
4304 {
4305         struct btrfs_root *root;
4306         struct inode *inode = dentry->d_inode;
4307
4308         if (!inode && !IS_ROOT(dentry))
4309                 inode = dentry->d_parent->d_inode;
4310
4311         if (inode) {
4312                 root = BTRFS_I(inode)->root;
4313                 if (btrfs_root_refs(&root->root_item) == 0)
4314                         return 1;
4315
4316                 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
4317                         return 1;
4318         }
4319         return 0;
4320 }
4321
4322 static void btrfs_dentry_release(struct dentry *dentry)
4323 {
4324         /* kfree(NULL) is a no-op */
4325         kfree(dentry->d_fsdata);
4326 }
4327
4328 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4329                                    unsigned int flags)
4330 {
4331         struct dentry *ret;
4332
4333         ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4334         if (unlikely(d_need_lookup(dentry))) {
4335                 spin_lock(&dentry->d_lock);
4336                 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4337                 spin_unlock(&dentry->d_lock);
4338         }
4339         return ret;
4340 }
4341
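/* map the on-disk BTRFS_FT_* file types to the DT_* values readdir expects */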
4342 unsigned char btrfs_filetype_table[] = {
4343         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4344 };
4345
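/*
 * readdir over a btrfs directory.  The f_pos layout is:
 *
 *   0            "."
 *   1            ".."
 *   2..N         DIR_INDEX (or DIR_ITEM for the tree root) key offsets
 *   0x7fffffff   end-of-directory marker
 */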
4346 static int btrfs_real_readdir(struct file *filp, void *dirent,
4347                               filldir_t filldir)
4348 {
4349         struct inode *inode = filp->f_dentry->d_inode;
4350         struct btrfs_root *root = BTRFS_I(inode)->root;
4351         struct btrfs_item *item;
4352         struct btrfs_dir_item *di;
4353         struct btrfs_key key;
4354         struct btrfs_key found_key;
4355         struct btrfs_path *path;
4356         struct list_head ins_list;
4357         struct list_head del_list;
4358         int ret;
4359         struct extent_buffer *leaf;
4360         int slot;
4361         unsigned char d_type;
4362         int over = 0;
4363         u32 di_cur;
4364         u32 di_total;
4365         u32 di_len;
4366         int key_type = BTRFS_DIR_INDEX_KEY;
4367         char tmp_name[32];
4368         char *name_ptr;
4369         int name_len;
4370         int is_curr = 0;        /* filp->f_pos points to the current index? */
4371
4372         /* FIXME, use a real flag for deciding about the key type */
4373         if (root->fs_info->tree_root == root)
4374                 key_type = BTRFS_DIR_ITEM_KEY;
4375
4376         /* special case for "." */
4377         if (filp->f_pos == 0) {
4378                 over = filldir(dirent, ".", 1,
4379                                filp->f_pos, btrfs_ino(inode), DT_DIR);
4380                 if (over)
4381                         return 0;
4382                 filp->f_pos = 1;
4383         }
4384         /* special case for .., just use the back ref */
4385         if (filp->f_pos == 1) {
4386                 u64 pino = parent_ino(filp->f_path.dentry);
4387                 over = filldir(dirent, "..", 2,
4388                                filp->f_pos, pino, DT_DIR);
4389                 if (over)
4390                         return 0;
4391                 filp->f_pos = 2;
4392         }
4393         path = btrfs_alloc_path();
4394         if (!path)
4395                 return -ENOMEM;
4396
4397         path->reada = 1;
4398
4399         if (key_type == BTRFS_DIR_INDEX_KEY) {
4400                 INIT_LIST_HEAD(&ins_list);
4401                 INIT_LIST_HEAD(&del_list);
4402                 btrfs_get_delayed_items(inode, &ins_list, &del_list);
4403         }
4404
4405         btrfs_set_key_type(&key, key_type);
4406         key.offset = filp->f_pos;
4407         key.objectid = btrfs_ino(inode);
4408
4409         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4410         if (ret < 0)
4411                 goto err;
4412
4413         while (1) {
4414                 leaf = path->nodes[0];
4415                 slot = path->slots[0];
4416                 if (slot >= btrfs_header_nritems(leaf)) {
4417                         ret = btrfs_next_leaf(root, path);
4418                         if (ret < 0)
4419                                 goto err;
4420                         else if (ret > 0)
4421                                 break;
4422                         continue;
4423                 }
4424
4425                 item = btrfs_item_nr(leaf, slot);
4426                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4427
4428                 if (found_key.objectid != key.objectid)
4429                         break;
4430                 if (btrfs_key_type(&found_key) != key_type)
4431                         break;
4432                 if (found_key.offset < filp->f_pos)
4433                         goto next;
4434                 if (key_type == BTRFS_DIR_INDEX_KEY &&
4435                     btrfs_should_delete_dir_index(&del_list,
4436                                                   found_key.offset))
4437                         goto next;
4438
4439                 filp->f_pos = found_key.offset;
4440                 is_curr = 1;
4441
4442                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
4443                 di_cur = 0;
4444                 di_total = btrfs_item_size(leaf, item);
4445
4446                 while (di_cur < di_total) {
4447                         struct btrfs_key location;
4448
4449                         if (verify_dir_item(root, leaf, di))
4450                                 break;
4451
4452                         name_len = btrfs_dir_name_len(leaf, di);
4453                         if (name_len <= sizeof(tmp_name)) {
4454                                 name_ptr = tmp_name;
4455                         } else {
4456                                 name_ptr = kmalloc(name_len, GFP_NOFS);
4457                                 if (!name_ptr) {
4458                                         ret = -ENOMEM;
4459                                         goto err;
4460                                 }
4461                         }
4462                         read_extent_buffer(leaf, name_ptr,
4463                                            (unsigned long)(di + 1), name_len);
4464
4465                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
4466                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
4467
4469                         /* is this a reference to our own snapshot? If so
4470                          * skip it.
4471                          *
4472                          * In contrast to old kernels, we insert the snapshot's
4473                          * dir item and dir index after it has been created, so
4474                          * we won't find a reference to our own snapshot. We
4475                          * still keep the following code for backward
4476                          * compatibility.
4477                          */
4478                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
4479                             location.objectid == root->root_key.objectid) {
4480                                 over = 0;
4481                                 goto skip;
4482                         }
4483                         over = filldir(dirent, name_ptr, name_len,
4484                                        found_key.offset, location.objectid,
4485                                        d_type);
4486
4487 skip:
4488                         if (name_ptr != tmp_name)
4489                                 kfree(name_ptr);
4490
4491                         if (over)
4492                                 goto nopos;
4493                         di_len = btrfs_dir_name_len(leaf, di) +
4494                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
4495                         di_cur += di_len;
4496                         di = (struct btrfs_dir_item *)((char *)di + di_len);
4497                 }
4498 next:
4499                 path->slots[0]++;
4500         }
4501
4502         if (key_type == BTRFS_DIR_INDEX_KEY) {
4503                 if (is_curr)
4504                         filp->f_pos++;
4505                 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
4506                                                       &ins_list);
4507                 if (ret)
4508                         goto nopos;
4509         }
4510
4511         /* Reached end of directory/root. Bump pos past the last item. */
4512         if (key_type == BTRFS_DIR_INDEX_KEY)
4513                 /*
4514                  * 32-bit glibc will use getdents64, but then strtol() the
4515                  * offset, so 0x7fffffff is the last position we can serve.
4516                  */
4517                 filp->f_pos = 0x7fffffff;
4518         else
4519                 filp->f_pos++;
4520 nopos:
4521         ret = 0;
4522 err:
4523         if (key_type == BTRFS_DIR_INDEX_KEY)
4524                 btrfs_put_delayed_items(&ins_list, &del_list);
4525         btrfs_free_path(path);
4526         return ret;
4527 }
4528
4529 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4530 {
4531         struct btrfs_root *root = BTRFS_I(inode)->root;
4532         struct btrfs_trans_handle *trans;
4533         int ret = 0;
4534         bool nolock = false;
4535
4536         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4537                 return 0;
4538
4539         if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
4540                 nolock = true;
4541
4542         if (wbc->sync_mode == WB_SYNC_ALL) {
4543                 if (nolock)
4544                         trans = btrfs_join_transaction_nolock(root);
4545                 else
4546                         trans = btrfs_join_transaction(root);
4547                 if (IS_ERR(trans))
4548                         return PTR_ERR(trans);
4549                 ret = btrfs_commit_transaction(trans, root);
4550         }
4551         return ret;
4552 }
4553
4554 /*
4555  * This is somewhat expensive, updating the tree every time the
4556  * inode changes.  But the inode is most likely to be found in cache.
4557  * FIXME, needs more benchmarking... performance is the only reason
4558  * to keep or drop this code.
4559  */
4560 int btrfs_dirty_inode(struct inode *inode)
4561 {
4562         struct btrfs_root *root = BTRFS_I(inode)->root;
4563         struct btrfs_trans_handle *trans;
4564         int ret;
4565
4566         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4567                 return 0;
4568
4569         trans = btrfs_join_transaction(root);
4570         if (IS_ERR(trans))
4571                 return PTR_ERR(trans);
4572
4573         ret = btrfs_update_inode(trans, root, inode);
4574         if (ret == -ENOSPC) {
4575                 /* whoops, let's try again with the full transaction */
4576                 btrfs_end_transaction(trans, root);
4577                 trans = btrfs_start_transaction(root, 1);
4578                 if (IS_ERR(trans))
4579                         return PTR_ERR(trans);
4580
4581                 ret = btrfs_update_inode(trans, root, inode);
4582         }
4583         btrfs_end_transaction(trans, root);
4584         if (BTRFS_I(inode)->delayed_node)
4585                 btrfs_balance_delayed_items(root);
4586
4587         return ret;
4588 }
4589
4590 /*
4591  * This is a copy of file_update_time.  We need it so we can return an error
4592  * (such as ENOSPC) from the inode update in the file write and mmap paths.
4593  */
4594 static int btrfs_update_time(struct inode *inode, struct timespec *now,
4595                              int flags)
4596 {
4597         struct btrfs_root *root = BTRFS_I(inode)->root;
4598
4599         if (btrfs_root_readonly(root))
4600                 return -EROFS;
4601
4602         if (flags & S_VERSION)
4603                 inode_inc_iversion(inode);
4604         if (flags & S_CTIME)
4605                 inode->i_ctime = *now;
4606         if (flags & S_MTIME)
4607                 inode->i_mtime = *now;
4608         if (flags & S_ATIME)
4609                 inode->i_atime = *now;
4610         return btrfs_dirty_inode(inode);
4611 }
4612
4613 /*
4614  * find the highest existing sequence number in a directory
4615  * and then set the in-memory index_cnt variable to the
4616  * next free sequence number
4617  */
4618 static int btrfs_set_inode_index_count(struct inode *inode)
4619 {
4620         struct btrfs_root *root = BTRFS_I(inode)->root;
4621         struct btrfs_key key, found_key;
4622         struct btrfs_path *path;
4623         struct extent_buffer *leaf;
4624         int ret;
4625
4626         key.objectid = btrfs_ino(inode);
4627         btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
4628         key.offset = (u64)-1;
4629
4630         path = btrfs_alloc_path();
4631         if (!path)
4632                 return -ENOMEM;
4633
4634         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4635         if (ret < 0)
4636                 goto out;
4637         /* FIXME: we should be able to handle this */
4638         if (ret == 0)
4639                 goto out;
4640         ret = 0;
4641
4642         /*
4643          * MAGIC NUMBER EXPLANATION:
4644          * since we search a directory based on f_pos we have to start at 2
4645          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
4646          * else has to start at 2
4647          */
4648         if (path->slots[0] == 0) {
4649                 BTRFS_I(inode)->index_cnt = 2;
4650                 goto out;
4651         }
4652
4653         path->slots[0]--;
4654
4655         leaf = path->nodes[0];
4656         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4657
4658         if (found_key.objectid != btrfs_ino(inode) ||
4659             btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
4660                 BTRFS_I(inode)->index_cnt = 2;
4661                 goto out;
4662         }
4663
4664         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
4665 out:
4666         btrfs_free_path(path);
4667         return ret;
4668 }
4669
4670 /*
4671  * helper to find a free sequence number in a given directory.  This current
4672  * code is very simple; later versions will do smarter things in the btree
4673  */
4674 int btrfs_set_inode_index(struct inode *dir, u64 *index)
4675 {
4676         int ret = 0;
4677
4678         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
4679                 ret = btrfs_inode_delayed_dir_index_count(dir);
4680                 if (ret) {
4681                         ret = btrfs_set_inode_index_count(dir);
4682                         if (ret)
4683                                 return ret;
4684                 }
4685         }
4686
4687         *index = BTRFS_I(dir)->index_cnt;
4688         BTRFS_I(dir)->index_cnt++;
4689
4690         return ret;
4691 }
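
/*
 * Illustrative only: callers reserve an index before linking, e.g.
 *
 *      err = btrfs_set_inode_index(dir, &index);
 *      ...
 *      err = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
 *
 * as btrfs_link() below does; btrfs_new_inode() reserves the index on
 * behalf of the other creation paths.
 */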
4692
4693 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4694                                      struct btrfs_root *root,
4695                                      struct inode *dir,
4696                                      const char *name, int name_len,
4697                                      u64 ref_objectid, u64 objectid,
4698                                      umode_t mode, u64 *index)
4699 {
4700         struct inode *inode;
4701         struct btrfs_inode_item *inode_item;
4702         struct btrfs_key *location;
4703         struct btrfs_path *path;
4704         struct btrfs_inode_ref *ref;
4705         struct btrfs_key key[2];
4706         u32 sizes[2];
4707         unsigned long ptr;
4708         int ret;
4710
4711         path = btrfs_alloc_path();
4712         if (!path)
4713                 return ERR_PTR(-ENOMEM);
4714
4715         inode = new_inode(root->fs_info->sb);
4716         if (!inode) {
4717                 btrfs_free_path(path);
4718                 return ERR_PTR(-ENOMEM);
4719         }
4720
4721         /*
4722          * we have to initialize this early, so we can reclaim the inode
4723          * number if we fail afterwards in this function.
4724          */
4725         inode->i_ino = objectid;
4726
4727         if (dir) {
4728                 trace_btrfs_inode_request(dir);
4729
4730                 ret = btrfs_set_inode_index(dir, index);
4731                 if (ret) {
4732                         btrfs_free_path(path);
4733                         iput(inode);
4734                         return ERR_PTR(ret);
4735                 }
4736         }
4737         /*
4738          * index_cnt is ignored for everything but a dir,
4739          * btrfs_set_inode_index_count has an explanation for the magic
4740          * number
4741          */
4742         BTRFS_I(inode)->index_cnt = 2;
4743         BTRFS_I(inode)->root = root;
4744         BTRFS_I(inode)->generation = trans->transid;
4745         inode->i_generation = BTRFS_I(inode)->generation;
4746
4747         /*
4748          * We could have gotten an inode number from somebody who was fsynced
4749          * and then removed in this same transaction, so let's just set full
4750          * sync since it will be a full sync anyway and this will blow away the
4751          * old info in the log.
4752          */
4753         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
4754
4760         key[0].objectid = objectid;
4761         btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4762         key[0].offset = 0;
4763
4764         /*
4765          * Start new inodes with an inode_ref. This is slightly more
4766          * efficient for small numbers of hard links since they will
4767          * be packed into one item. Extended refs will kick in if we
4768          * add more hard links than can fit in the ref item.
4769          */
4770         key[1].objectid = objectid;
4771         btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4772         key[1].offset = ref_objectid;
4773
4774         sizes[0] = sizeof(struct btrfs_inode_item);
4775         sizes[1] = name_len + sizeof(*ref);
4776
4777         path->leave_spinning = 1;
4778         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
4779         if (ret != 0)
4780                 goto fail;
4781
4782         inode_init_owner(inode, dir, mode);
4783         inode_set_bytes(inode, 0);
4784         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4785         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4786                                   struct btrfs_inode_item);
4787         memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
4788                              sizeof(*inode_item));
4789         fill_inode_item(trans, path->nodes[0], inode_item, inode);
4790
4791         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
4792                              struct btrfs_inode_ref);
4793         btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
4794         btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
4795         ptr = (unsigned long)(ref + 1);
4796         write_extent_buffer(path->nodes[0], name, ptr, name_len);
4797
4798         btrfs_mark_buffer_dirty(path->nodes[0]);
4799         btrfs_free_path(path);
4800
4801         location = &BTRFS_I(inode)->location;
4802         location->objectid = objectid;
4803         location->offset = 0;
4804         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
4805
4806         btrfs_inherit_iflags(inode, dir);
4807
4808         if (S_ISREG(mode)) {
4809                 if (btrfs_test_opt(root, NODATASUM))
4810                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4811                 if (btrfs_test_opt(root, NODATACOW) ||
4812                     (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4813                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4814         }
4815
4816         insert_inode_hash(inode);
4817         inode_tree_add(inode);
4818
4819         trace_btrfs_inode_new(inode);
4820         btrfs_set_inode_last_trans(trans, inode);
4821
4822         btrfs_update_root_times(trans, root);
4823
4824         return inode;
4825 fail:
4826         if (dir)
4827                 BTRFS_I(dir)->index_cnt--;
4828         btrfs_free_path(path);
4829         iput(inode);
4830         return ERR_PTR(ret);
4831 }
4832
4833 static inline u8 btrfs_inode_type(struct inode *inode)
4834 {
4835         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
4836 }
4837
4838 /*
4839  * utility function to add 'inode' into 'parent_inode' with
4840  * a given name and a given sequence number.
4841  * if 'add_backref' is true, also insert a backref from the
4842  * inode to the parent directory.
4843  */
4844 int btrfs_add_link(struct btrfs_trans_handle *trans,
4845                    struct inode *parent_inode, struct inode *inode,
4846                    const char *name, int name_len, int add_backref, u64 index)
4847 {
4848         int ret = 0;
4849         struct btrfs_key key;
4850         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
4851         u64 ino = btrfs_ino(inode);
4852         u64 parent_ino = btrfs_ino(parent_inode);
4853
4854         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4855                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
4856         } else {
4857                 key.objectid = ino;
4858                 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4859                 key.offset = 0;
4860         }
4861
4862         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4863                 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4864                                          key.objectid, root->root_key.objectid,
4865                                          parent_ino, index, name, name_len);
4866         } else if (add_backref) {
4867                 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
4868                                              parent_ino, index);
4869         }
4870
4871         /* Nothing to clean up yet */
4872         if (ret)
4873                 return ret;
4874
4875         ret = btrfs_insert_dir_item(trans, root, name, name_len,
4876                                     parent_inode, &key,
4877                                     btrfs_inode_type(inode), index);
4878         if (ret == -EEXIST)
4879                 goto fail_dir_item;
4880         else if (ret) {
4881                 btrfs_abort_transaction(trans, root, ret);
4882                 return ret;
4883         }
4884
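        /*
         * directory i_size counts each name twice: once for the DIR_ITEM
         * and once for the DIR_INDEX entry
         */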
4885         btrfs_i_size_write(parent_inode, parent_inode->i_size +
4886                            name_len * 2);
4887         inode_inc_iversion(parent_inode);
4888         parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
4889         ret = btrfs_update_inode(trans, root, parent_inode);
4890         if (ret)
4891                 btrfs_abort_transaction(trans, root, ret);
4892         return ret;
4893
4894 fail_dir_item:
4895         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4896                 u64 local_index;
4897                 int err;
4898                 err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
4899                                  key.objectid, root->root_key.objectid,
4900                                  parent_ino, &local_index, name, name_len);
4901
4902         } else if (add_backref) {
4903                 u64 local_index;
4904                 int err;
4905
4906                 err = btrfs_del_inode_ref(trans, root, name, name_len,
4907                                           ino, parent_ino, &local_index);
4908         }
4909         return ret;
4910 }
4911
4912 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4913                             struct inode *dir, struct dentry *dentry,
4914                             struct inode *inode, int backref, u64 index)
4915 {
4916         int err = btrfs_add_link(trans, dir, inode,
4917                                  dentry->d_name.name, dentry->d_name.len,
4918                                  backref, index);
4919         if (err > 0)
4920                 err = -EEXIST;
4921         return err;
4922 }
4923
4924 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4925                         umode_t mode, dev_t rdev)
4926 {
4927         struct btrfs_trans_handle *trans;
4928         struct btrfs_root *root = BTRFS_I(dir)->root;
4929         struct inode *inode = NULL;
4930         int err;
4931         int drop_inode = 0;
4932         u64 objectid;
4933         u64 index = 0;
4934
4935         if (!new_valid_dev(rdev))
4936                 return -EINVAL;
4937
4938         /*
4939          * 2 for inode item and ref
4940          * 2 for dir items
4941          * 1 for xattr if selinux is on
4942          */
4943         trans = btrfs_start_transaction(root, 5);
4944         if (IS_ERR(trans))
4945                 return PTR_ERR(trans);
4946
4947         err = btrfs_find_free_ino(root, &objectid);
4948         if (err)
4949                 goto out_unlock;
4950
4951         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4952                                 dentry->d_name.len, btrfs_ino(dir), objectid,
4953                                 mode, &index);
4954         if (IS_ERR(inode)) {
4955                 err = PTR_ERR(inode);
4956                 goto out_unlock;
4957         }
4958
4959         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4960         if (err) {
4961                 drop_inode = 1;
4962                 goto out_unlock;
4963         }
4964
4965         err = btrfs_update_inode(trans, root, inode);
4966         if (err) {
4967                 drop_inode = 1;
4968                 goto out_unlock;
4969         }
4970
4971         /*
4972          * If the active LSM wants to access the inode during
4973          * d_instantiate it needs these. Smack checks to see
4974          * if the filesystem supports xattrs by looking at the
4975          * ops vector.
4976          */
4977
4978         inode->i_op = &btrfs_special_inode_operations;
4979         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4980         if (err)
4981                 drop_inode = 1;
4982         else {
4983                 init_special_inode(inode, inode->i_mode, rdev);
4984                 btrfs_update_inode(trans, root, inode);
4985                 d_instantiate(dentry, inode);
4986         }
4987 out_unlock:
4988         btrfs_end_transaction(trans, root);
4989         btrfs_btree_balance_dirty(root);
4990         if (drop_inode) {
4991                 inode_dec_link_count(inode);
4992                 iput(inode);
4993         }
4994         return err;
4995 }
4996
4997 static int btrfs_create(struct inode *dir, struct dentry *dentry,
4998                         umode_t mode, bool excl)
4999 {
5000         struct btrfs_trans_handle *trans;
5001         struct btrfs_root *root = BTRFS_I(dir)->root;
5002         struct inode *inode = NULL;
5003         int drop_inode_on_err = 0;
5004         int err;
5005         u64 objectid;
5006         u64 index = 0;
5007
5008         /*
5009          * 2 for inode item and ref
5010          * 2 for dir items
5011          * 1 for xattr if selinux is on
5012          */
5013         trans = btrfs_start_transaction(root, 5);
5014         if (IS_ERR(trans))
5015                 return PTR_ERR(trans);
5016
5017         err = btrfs_find_free_ino(root, &objectid);
5018         if (err)
5019                 goto out_unlock;
5020
5021         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5022                                 dentry->d_name.len, btrfs_ino(dir), objectid,
5023                                 mode, &index);
5024         if (IS_ERR(inode)) {
5025                 err = PTR_ERR(inode);
5026                 goto out_unlock;
5027         }
5028         drop_inode_on_err = 1;
5029
5030         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5031         if (err)
5032                 goto out_unlock;
5033
5034         /*
5035          * If the active LSM wants to access the inode during
5036          * d_instantiate it needs these. Smack checks to see
5037          * if the filesystem supports xattrs by looking at the
5038          * ops vector.
5039          */
5040         inode->i_fop = &btrfs_file_operations;
5041         inode->i_op = &btrfs_file_inode_operations;
5042
5043         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5044         if (err)
5045                 goto out_unlock;
5046
5047         inode->i_mapping->a_ops = &btrfs_aops;
5048         inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5049         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5050         d_instantiate(dentry, inode);
5051
5052 out_unlock:
5053         btrfs_end_transaction(trans, root);
5054         if (err && drop_inode_on_err) {
5055                 inode_dec_link_count(inode);
5056                 iput(inode);
5057         }
5058         btrfs_btree_balance_dirty(root);
5059         return err;
5060 }
5061
5062 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5063                       struct dentry *dentry)
5064 {
5065         struct btrfs_trans_handle *trans;
5066         struct btrfs_root *root = BTRFS_I(dir)->root;
5067         struct inode *inode = old_dentry->d_inode;
5068         u64 index;
5069         int err;
5070         int drop_inode = 0;
5071
5072         /* do not allow hard links across subvolumes of the same device */
5073         if (root->objectid != BTRFS_I(inode)->root->objectid)
5074                 return -EXDEV;
5075
5076         if (inode->i_nlink >= BTRFS_LINK_MAX)
5077                 return -EMLINK;
5078
5079         err = btrfs_set_inode_index(dir, &index);
5080         if (err)
5081                 goto fail;
5082
5083         /*
5084          * 2 items for inode and inode ref
5085          * 2 items for dir items
5086          * 1 item for parent inode
5087          */
5088         trans = btrfs_start_transaction(root, 5);
5089         if (IS_ERR(trans)) {
5090                 err = PTR_ERR(trans);
5091                 goto fail;
5092         }
5093
5094         btrfs_inc_nlink(inode);
5095         inode_inc_iversion(inode);
5096         inode->i_ctime = CURRENT_TIME;
5097         ihold(inode);
5098         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5099
5100         err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5101
5102         if (err) {
5103                 drop_inode = 1;
5104         } else {
5105                 struct dentry *parent = dentry->d_parent;
5106                 err = btrfs_update_inode(trans, root, inode);
5107                 if (err)
5108                         goto fail;
5109                 d_instantiate(dentry, inode);
5110                 btrfs_log_new_name(trans, inode, NULL, parent);
5111         }
5112
5113         btrfs_end_transaction(trans, root);
5114 fail:
5115         if (drop_inode) {
5116                 inode_dec_link_count(inode);
5117                 iput(inode);
5118         }
5119         btrfs_btree_balance_dirty(root);
5120         return err;
5121 }
5122
5123 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5124 {
5125         struct inode *inode = NULL;
5126         struct btrfs_trans_handle *trans;
5127         struct btrfs_root *root = BTRFS_I(dir)->root;
5128         int err = 0;
5129         int drop_on_err = 0;
5130         u64 objectid = 0;
5131         u64 index = 0;
5132
5133         /*
5134          * 2 items for inode and ref
5135          * 2 items for dir items
5136          * 1 for xattr if selinux is on
5137          */
5138         trans = btrfs_start_transaction(root, 5);
5139         if (IS_ERR(trans))
5140                 return PTR_ERR(trans);
5141
5142         err = btrfs_find_free_ino(root, &objectid);
5143         if (err)
5144                 goto out_fail;
5145
5146         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5147                                 dentry->d_name.len, btrfs_ino(dir), objectid,
5148                                 S_IFDIR | mode, &index);
5149         if (IS_ERR(inode)) {
5150                 err = PTR_ERR(inode);
5151                 goto out_fail;
5152         }
5153
5154         drop_on_err = 1;
5155
5156         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5157         if (err)
5158                 goto out_fail;
5159
5160         inode->i_op = &btrfs_dir_inode_operations;
5161         inode->i_fop = &btrfs_dir_file_operations;
5162
5163         btrfs_i_size_write(inode, 0);
5164         err = btrfs_update_inode(trans, root, inode);
5165         if (err)
5166                 goto out_fail;
5167
5168         err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
5169                              dentry->d_name.len, 0, index);
5170         if (err)
5171                 goto out_fail;
5172
5173         d_instantiate(dentry, inode);
5174         drop_on_err = 0;
5175
5176 out_fail:
5177         btrfs_end_transaction(trans, root);
5178         if (drop_on_err)
5179                 iput(inode);
5180         btrfs_btree_balance_dirty(root);
5181         return err;
5182 }
5183
5184 /* helper for btrfs_get_extent.  Given an existing extent in the tree,
5185  * and an extent that you want to insert, deal with overlap and insert
5186  * the new extent into the tree.
5187  */
5188 static int merge_extent_mapping(struct extent_map_tree *em_tree,
5189                                 struct extent_map *existing,
5190                                 struct extent_map *em,
5191                                 u64 map_start, u64 map_len)
5192 {
5193         u64 start_diff;
5194
5195         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
5196         start_diff = map_start - em->start;
5197         em->start = map_start;
5198         em->len = map_len;
5199         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
5200             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
5201                 em->block_start += start_diff;
5202                 em->block_len -= start_diff;
5203         }
5204         return add_extent_mapping(em_tree, em);
5205 }
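
/*
 * For example: if @em covers [0, 12k) but [0, 8k) is already mapped by
 * @existing, the caller passes map_start = 8k and map_len = 4k; start_diff
 * is then 8k, so @em is trimmed to [8k, 12k) and its block_start advanced
 * by 8k before the insert.
 */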
5206
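/*
 * Copy a compressed inline extent into @page.  If decompression fails,
 * the target page range is zeroed instead and the error is not
 * propagated; the function still returns 0.
 */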
5207 static noinline int uncompress_inline(struct btrfs_path *path,
5208                                       struct inode *inode, struct page *page,
5209                                       size_t pg_offset, u64 extent_offset,
5210                                       struct btrfs_file_extent_item *item)
5211 {
5212         int ret;
5213         struct extent_buffer *leaf = path->nodes[0];
5214         char *tmp;
5215         size_t max_size;
5216         unsigned long inline_size;
5217         unsigned long ptr;
5218         int compress_type;
5219
5220         WARN_ON(pg_offset != 0);
5221         compress_type = btrfs_file_extent_compression(leaf, item);
5222         max_size = btrfs_file_extent_ram_bytes(leaf, item);
5223         inline_size = btrfs_file_extent_inline_item_len(leaf,
5224                                         btrfs_item_nr(leaf, path->slots[0]));
5225         tmp = kmalloc(inline_size, GFP_NOFS);
5226         if (!tmp)
5227                 return -ENOMEM;
5228         ptr = btrfs_file_extent_inline_start(item);
5229
5230         read_extent_buffer(leaf, tmp, ptr, inline_size);
5231
5232         max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
5233         ret = btrfs_decompress(compress_type, tmp, page,
5234                                extent_offset, inline_size, max_size);
5235         if (ret) {
5236                 char *kaddr = kmap_atomic(page);
5237                 unsigned long copy_size = min_t(u64,
5238                                   PAGE_CACHE_SIZE - pg_offset,
5239                                   max_size - extent_offset);
5240                 memset(kaddr + pg_offset, 0, copy_size);
5241                 kunmap_atomic(kaddr);
5242         }
5243         kfree(tmp);
5244         return 0;
5245 }
5246
5247 /*
5248  * A bit scary, this does extent mapping from logical file offset to the disk.
5249  * The ugly parts come from merging extents from the disk with the in-ram
5250  * representation.  This gets more complex because of the data=ordered code,
5251  * where the in-ram extents might be locked pending data=ordered completion.
5252  *
5253  * This also copies inline extents directly into the page.
5254  */
5255
5256 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
5257                                     size_t pg_offset, u64 start, u64 len,
5258                                     int create)
5259 {
5260         int ret;
5261         int err = 0;
5262         u64 bytenr;
5263         u64 extent_start = 0;
5264         u64 extent_end = 0;
5265         u64 objectid = btrfs_ino(inode);
5266         u32 found_type;
5267         struct btrfs_path *path = NULL;
5268         struct btrfs_root *root = BTRFS_I(inode)->root;
5269         struct btrfs_file_extent_item *item;
5270         struct extent_buffer *leaf;
5271         struct btrfs_key found_key;
5272         struct extent_map *em = NULL;
5273         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5274         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5275         struct btrfs_trans_handle *trans = NULL;
5276         int compress_type;
5277
5278 again:
5279         read_lock(&em_tree->lock);
5280         em = lookup_extent_mapping(em_tree, start, len);
5281         if (em)
5282                 em->bdev = root->fs_info->fs_devices->latest_bdev;
5283         read_unlock(&em_tree->lock);
5284
5285         if (em) {
5286                 if (em->start > start || em->start + em->len <= start)
5287                         free_extent_map(em);
5288                 else if (em->block_start == EXTENT_MAP_INLINE && page)
5289                         free_extent_map(em);
5290                 else
5291                         goto out;
5292         }
5293         em = alloc_extent_map();
5294         if (!em) {
5295                 err = -ENOMEM;
5296                 goto out;
5297         }
5298         em->bdev = root->fs_info->fs_devices->latest_bdev;
5299         em->start = EXTENT_MAP_HOLE;
5300         em->orig_start = EXTENT_MAP_HOLE;
5301         em->len = (u64)-1;
5302         em->block_len = (u64)-1;
5303
5304         if (!path) {
5305                 path = btrfs_alloc_path();
5306                 if (!path) {
5307                         err = -ENOMEM;
5308                         goto out;
5309                 }
5310                 /*
5311                  * Chances are we'll be called again, so go ahead and do
5312                  * readahead
5313                  */
5314                 path->reada = 1;
5315         }
5316
5317         ret = btrfs_lookup_file_extent(trans, root, path,
5318                                        objectid, start, trans != NULL);
5319         if (ret < 0) {
5320                 err = ret;
5321                 goto out;
5322         }
5323
5324         if (ret != 0) {
5325                 if (path->slots[0] == 0)
5326                         goto not_found;
5327                 path->slots[0]--;
5328         }
5329
5330         leaf = path->nodes[0];
5331         item = btrfs_item_ptr(leaf, path->slots[0],
5332                               struct btrfs_file_extent_item);
5333         /* are we inside the extent that was found? */
5334         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5335         found_type = btrfs_key_type(&found_key);
5336         if (found_key.objectid != objectid ||
5337             found_type != BTRFS_EXTENT_DATA_KEY) {
5338                 goto not_found;
5339         }
5340
5341         found_type = btrfs_file_extent_type(leaf, item);
5342         extent_start = found_key.offset;
5343         compress_type = btrfs_file_extent_compression(leaf, item);
5344         if (found_type == BTRFS_FILE_EXTENT_REG ||
5345             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5346                 extent_end = extent_start +
5347                        btrfs_file_extent_num_bytes(leaf, item);
5348         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5349                 size_t size;
5350                 size = btrfs_file_extent_inline_len(leaf, item);
5351                 extent_end = (extent_start + size + root->sectorsize - 1) &
5352                         ~((u64)root->sectorsize - 1);
5353         }
5354
5355         if (start >= extent_end) {
5356                 path->slots[0]++;
5357                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
5358                         ret = btrfs_next_leaf(root, path);
5359                         if (ret < 0) {
5360                                 err = ret;
5361                                 goto out;
5362                         }
5363                         if (ret > 0)
5364                                 goto not_found;
5365                         leaf = path->nodes[0];
5366                 }
5367                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5368                 if (found_key.objectid != objectid ||
5369                     found_key.type != BTRFS_EXTENT_DATA_KEY)
5370                         goto not_found;
5371                 if (start + len <= found_key.offset)
5372                         goto not_found;
5373                 em->start = start;
5374                 em->len = found_key.offset - start;
5375                 goto not_found_em;
5376         }
5377
5378         if (found_type == BTRFS_FILE_EXTENT_REG ||
5379             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5380                 em->start = extent_start;
5381                 em->len = extent_end - extent_start;
5382                 em->orig_start = extent_start -
5383                                  btrfs_file_extent_offset(leaf, item);
5384                 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5385                                                                       item);
5386                 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5387                 if (bytenr == 0) {
5388                         em->block_start = EXTENT_MAP_HOLE;
5389                         goto insert;
5390                 }
5391                 if (compress_type != BTRFS_COMPRESS_NONE) {
5392                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5393                         em->compress_type = compress_type;
5394                         em->block_start = bytenr;
5395                         em->block_len = em->orig_block_len;
5396                 } else {
5397                         bytenr += btrfs_file_extent_offset(leaf, item);
5398                         em->block_start = bytenr;
5399                         em->block_len = em->len;
5400                         if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
5401                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5402                 }
5403                 goto insert;
5404         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5405                 unsigned long ptr;
5406                 char *map;
5407                 size_t size;
5408                 size_t extent_offset;
5409                 size_t copy_size;
5410
5411                 em->block_start = EXTENT_MAP_INLINE;
5412                 if (!page || create) {
5413                         em->start = extent_start;
5414                         em->len = extent_end - extent_start;
5415                         goto out;
5416                 }
5417
5418                 size = btrfs_file_extent_inline_len(leaf, item);
5419                 extent_offset = page_offset(page) + pg_offset - extent_start;
5420                 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
5421                                 size - extent_offset);
5422                 em->start = extent_start + extent_offset;
5423                 em->len = (copy_size + root->sectorsize - 1) &
5424                         ~((u64)root->sectorsize - 1);
5425                 em->orig_block_len = em->len;
5426                 em->orig_start = EXTENT_MAP_INLINE;
5427                 if (compress_type) {
5428                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5429                         em->compress_type = compress_type;
5430                 }
5431                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5432                 if (create == 0 && !PageUptodate(page)) {
5433                         if (btrfs_file_extent_compression(leaf, item) !=
5434                             BTRFS_COMPRESS_NONE) {
5435                                 ret = uncompress_inline(path, inode, page,
5436                                                         pg_offset,
5437                                                         extent_offset, item);
5438                                 BUG_ON(ret); /* -ENOMEM */
5439                         } else {
5440                                 map = kmap(page);
5441                                 read_extent_buffer(leaf, map + pg_offset, ptr,
5442                                                    copy_size);
5443                                 if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
5444                                         memset(map + pg_offset + copy_size, 0,
5445                                                PAGE_CACHE_SIZE - pg_offset -
5446                                                copy_size);
5447                                 }
5448                                 kunmap(page);
5449                         }
5450                         flush_dcache_page(page);
5451                 } else if (create && PageUptodate(page)) {
5452                         BUG();
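                        /* everything below in this branch is dead code:
                         * BUG() does not return */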
5453                         if (!trans) {
5454                                 kunmap(page);
5455                                 free_extent_map(em);
5456                                 em = NULL;
5457
5458                                 btrfs_release_path(path);
5459                                 trans = btrfs_join_transaction(root);
5460
5461                                 if (IS_ERR(trans))
5462                                         return ERR_CAST(trans);
5463                                 goto again;
5464                         }
5465                         map = kmap(page);
5466                         write_extent_buffer(leaf, map + pg_offset, ptr,
5467                                             copy_size);
5468                         kunmap(page);
5469                         btrfs_mark_buffer_dirty(leaf);
5470                 }
5471                 set_extent_uptodate(io_tree, em->start,
5472                                     extent_map_end(em) - 1, NULL, GFP_NOFS);
5473                 goto insert;
5474         } else {
5475                 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
5476         }
5477 not_found:
5478         em->start = start;
5479         em->len = len;
5480 not_found_em:
5481         em->block_start = EXTENT_MAP_HOLE;
5482         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
5483 insert:
5484         btrfs_release_path(path);
5485         if (em->start > start || extent_map_end(em) <= start) {
5486                 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
5487                        "[%llu %llu]\n", (unsigned long long)em->start,
5488                        (unsigned long long)em->len,
5489                        (unsigned long long)start,
5490                        (unsigned long long)len);
5491                 err = -EIO;
5492                 goto out;
5493         }
5494
5495         err = 0;
5496         write_lock(&em_tree->lock);
5497         ret = add_extent_mapping(em_tree, em);
5498         /* it is possible that someone inserted the extent into the tree
5499          * while we had the lock dropped.  It is also possible that
5500          * an overlapping map exists in the tree
5501          */
5502         if (ret == -EEXIST) {
5503                 struct extent_map *existing;
5504
5505                 ret = 0;
5506
5507                 existing = lookup_extent_mapping(em_tree, start, len);
5508                 if (existing && (existing->start > start ||
5509                     existing->start + existing->len <= start)) {
5510                         free_extent_map(existing);
5511                         existing = NULL;
5512                 }
5513                 if (!existing) {
5514                         existing = lookup_extent_mapping(em_tree, em->start,
5515                                                          em->len);
5516                         if (existing) {
5517                                 err = merge_extent_mapping(em_tree, existing,
5518                                                            em, start,
5519                                                            root->sectorsize);
5520                                 free_extent_map(existing);
5521                                 if (err) {
5522                                         free_extent_map(em);
5523                                         em = NULL;
5524                                 }
5525                         } else {
5526                                 err = -EIO;
5527                                 free_extent_map(em);
5528                                 em = NULL;
5529                         }
5530                 } else {
5531                         free_extent_map(em);
5532                         em = existing;
5533                         err = 0;
5534                 }
5535         }
5536         write_unlock(&em_tree->lock);
5537 out:
5538
5539         if (em)
5540                 trace_btrfs_get_extent(root, em);
5541
5542         if (path)
5543                 btrfs_free_path(path);
5544         if (trans) {
5545                 ret = btrfs_end_transaction(trans, root);
5546                 if (!err)
5547                         err = ret;
5548         }
5549         if (err) {
5550                 free_extent_map(em);
5551                 return ERR_PTR(err);
5552         }
5553         BUG_ON(!em); /* Error is always set */
5554         return em;
5555 }
5556
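/*
 * fiemap variant of btrfs_get_extent: if the mapping we get back is a
 * hole, look for delalloc bytes in the io_tree over the same range and,
 * if any are found, report them as an EXTENT_MAP_DELALLOC mapping so
 * fiemap can show data that is still sitting in the page cache.
 */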
5557 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5558                                            size_t pg_offset, u64 start, u64 len,
5559                                            int create)
5560 {
5561         struct extent_map *em;
5562         struct extent_map *hole_em = NULL;
5563         u64 range_start = start;
5564         u64 end;
5565         u64 found;
5566         u64 found_end;
5567         int err = 0;
5568
5569         em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5570         if (IS_ERR(em))
5571                 return em;
5572         if (em) {
5573                 /*
5574                  * if our em maps to a hole, there might
5575                  * actually be delalloc bytes behind it
5576                  */
5577                 if (em->block_start != EXTENT_MAP_HOLE)
5578                         return em;
5579                 else
5580                         hole_em = em;
5581         }
5582
5583         /* check to see if we've wrapped (len == -1 or similar) */
5584         end = start + len;
5585         if (end < start)
5586                 end = (u64)-1;
5587         else
5588                 end -= 1;
5589
5590         em = NULL;
5591
5592         /* ok, we didn't find anything, let's look for delalloc */
5593         found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5594                                  end, len, EXTENT_DELALLOC, 1);
5595         found_end = range_start + found;
5596         if (found_end < range_start)
5597                 found_end = (u64)-1;
5598
5599         /*
5600          * we didn't find anything useful, return
5601          * the original results from get_extent()
5602          */
5603         if (range_start > end || found_end <= start) {
5604                 em = hole_em;
5605                 hole_em = NULL;
5606                 goto out;
5607         }
5608
5609         /* adjust the range_start to make sure it doesn't
5610          * go backwards from the start they passed in
5611          */
5612         range_start = max(start, range_start);
5613         found = found_end - range_start;
5614
5615         if (found > 0) {
5616                 u64 hole_start = start;
5617                 u64 hole_len = len;
5618
5619                 em = alloc_extent_map();
5620                 if (!em) {
5621                         err = -ENOMEM;
5622                         goto out;
5623                 }
5624                 /*
5625                  * when btrfs_get_extent can't find anything it
5626                  * returns one huge hole
5627                  *
5628                  * make sure what it found really fits our range, and
5629                  * adjust to make sure it is based on the start from
5630                  * the caller
5631                  */
5632                 if (hole_em) {
5633                         u64 calc_end = extent_map_end(hole_em);
5634
5635                         if (calc_end <= start || (hole_em->start > end)) {
5636                                 free_extent_map(hole_em);
5637                                 hole_em = NULL;
5638                         } else {
5639                                 hole_start = max(hole_em->start, start);
5640                                 hole_len = calc_end - hole_start;
5641                         }
5642                 }
5643                 em->bdev = NULL;
5644                 if (hole_em && range_start > hole_start) {
5645                         /* our hole starts before our delalloc, so we
5646                          * have to return just the part of the hole
5647                          * that extends until the delalloc starts
5648                          */
5649                         em->len = min(hole_len,
5650                                       range_start - hole_start);
5651                         em->start = hole_start;
5652                         em->orig_start = hole_start;
5653                         /*
5654                          * don't adjust block start at all,
5655                          * it is fixed at EXTENT_MAP_HOLE
5656                          */
5657                         em->block_start = hole_em->block_start;
5658                         em->block_len = hole_len;
5659                 } else {
5660                         em->start = range_start;
5661                         em->len = found;
5662                         em->orig_start = range_start;
5663                         em->block_start = EXTENT_MAP_DELALLOC;
5664                         em->block_len = found;
5665                 }
5666         } else if (hole_em) {
5667                 return hole_em;
5668         }
5669 out:
5670
5671         free_extent_map(hole_em);
5672         if (err) {
5673                 free_extent_map(em);
5674                 return ERR_PTR(err);
5675         }
5676         return em;
5677 }
5678
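/*
 * allocate a new data extent for a direct IO write.  If the extent map
 * passed in is a hole that exactly covers [start, start + len) it is
 * reused, otherwise it is freed, the cached range is dropped and a
 * fresh pinned mapping is inserted for the new allocation.
 */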
5679 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5680                                                   struct extent_map *em,
5681                                                   u64 start, u64 len)
5682 {
5683         struct btrfs_root *root = BTRFS_I(inode)->root;
5684         struct btrfs_trans_handle *trans;
5685         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5686         struct btrfs_key ins;
5687         u64 alloc_hint;
5688         int ret;
5689         bool insert = false;
5690
5691         /*
5692          * Ok, if the extent map we looked up is a hole covering exactly the
5693          * range we want, there is no reason to allocate a new one.  However,
5694          * if it is not an exact match we need to free this one and drop the
5695          * cache for our range.
5696          */
5697         if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5698             em->len != len) {
5699                 free_extent_map(em);
5700                 em = NULL;
5701                 insert = true;
5702                 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5703         }
5704
5705         trans = btrfs_join_transaction(root);
5706         if (IS_ERR(trans))
5707                 return ERR_CAST(trans);
5708
5709         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5710
5711         alloc_hint = get_extent_allocation_hint(inode, start, len);
5712         ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5713                                    alloc_hint, &ins, 1);
5714         if (ret) {
5715                 em = ERR_PTR(ret);
5716                 goto out;
5717         }
5718
5719         if (!em) {
5720                 em = alloc_extent_map();
5721                 if (!em) {
5722                         em = ERR_PTR(-ENOMEM);
5723                         goto out;
5724                 }
5725         }
5726
5727         em->start = start;
5728         em->orig_start = em->start;
5729         em->len = ins.offset;
5730
5731         em->block_start = ins.objectid;
5732         em->block_len = ins.offset;
5733         em->orig_block_len = ins.offset;
5734         em->bdev = root->fs_info->fs_devices->latest_bdev;
5735
5736         /*
5737          * We need to do this because if we're using the original em we searched
5738          * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5739          */
5740         em->flags = 0;
5741         set_bit(EXTENT_FLAG_PINNED, &em->flags);
5742
5743         while (insert) {
5744                 write_lock(&em_tree->lock);
5745                 ret = add_extent_mapping(em_tree, em);
5746                 write_unlock(&em_tree->lock);
5747                 if (ret != -EEXIST)
5748                         break;
5749                 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5750         }
5751
5752         ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5753                                            ins.offset, ins.offset, 0);
5754         if (ret) {
5755                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5756                 em = ERR_PTR(ret);
5757         }
5758 out:
5759         btrfs_end_transaction(trans, root);
5760         return em;
5761 }
5762
5763 /*
5764  * returns 1 when the nocow is safe, < 0 on error, 0 if the
5765  * block must be cow'd
5766  */
5767 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5768                                       struct inode *inode, u64 offset, u64 len)
5769 {
5770         struct btrfs_path *path;
5771         int ret;
5772         struct extent_buffer *leaf;
5773         struct btrfs_root *root = BTRFS_I(inode)->root;
5774         struct btrfs_file_extent_item *fi;
5775         struct btrfs_key key;
5776         u64 disk_bytenr;
5777         u64 backref_offset;
5778         u64 extent_end;
5779         u64 num_bytes;
5780         int slot;
5781         int found_type;
5782
5783         path = btrfs_alloc_path();
5784         if (!path)
5785                 return -ENOMEM;
5786
5787         ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
5788                                        offset, 0);
5789         if (ret < 0)
5790                 goto out;
5791
5792         slot = path->slots[0];
5793         if (ret == 1) {
5794                 if (slot == 0) {
5795                         /* can't find the item, must cow */
5796                         ret = 0;
5797                         goto out;
5798                 }
5799                 slot--;
5800         }
5801         ret = 0;
5802         leaf = path->nodes[0];
5803         btrfs_item_key_to_cpu(leaf, &key, slot);
5804         if (key.objectid != btrfs_ino(inode) ||
5805             key.type != BTRFS_EXTENT_DATA_KEY) {
5806                 /* not our file or wrong item type, must cow */
5807                 goto out;
5808         }
5809
5810         if (key.offset > offset) {
5811                 /* Wrong offset, must cow */
5812                 goto out;
5813         }
5814
5815         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5816         found_type = btrfs_file_extent_type(leaf, fi);
5817         if (found_type != BTRFS_FILE_EXTENT_REG &&
5818             found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5819                 /* not a regular extent, must cow */
5820                 goto out;
5821         }
5822         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5823         backref_offset = btrfs_file_extent_offset(leaf, fi);
5824
5825         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5826         if (extent_end < offset + len) {
5827                 /* extent doesn't include our full range, must cow */
5828                 goto out;
5829         }
5830
5831         if (btrfs_extent_readonly(root, disk_bytenr))
5832                 goto out;
5833
5834         /*
5835          * look for other files referencing this extent, if we
5836          * find any we must cow
5837          */
5838         if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
5839                                   key.offset - backref_offset, disk_bytenr))
5840                 goto out;
5841
5842         /*
5843          * adjust disk_bytenr and num_bytes to cover just the bytes
5844          * in this extent we are about to write.  If there
5845          * are any csums in that range we have to cow in order
5846          * to keep the csums correct
5847          */
5848         disk_bytenr += backref_offset;
5849         disk_bytenr += offset - key.offset;
5850         num_bytes = min(offset + len, extent_end) - offset;
5851         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5852                 goto out;
5853         /*
5854          * all of the above have passed, it is safe to overwrite this extent
5855          * without cow
5856          */
5857         ret = 1;
5858 out:
5859         btrfs_free_path(path);
5860         return ret;
5861 }
5862
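/*
 * lock the extent range for direct IO, waiting out any ordered extents
 * and flushing/invalidating buffered pages that raced in after the
 * invalidate in generic_file_direct_write.  Returns nonzero if the
 * pages could not be invalidated, in which case the caller must fall
 * back to buffered IO.
 */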
5863 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5864                               struct extent_state **cached_state, int writing)
5865 {
5866         struct btrfs_ordered_extent *ordered;
5867         int ret = 0;
5868
5869         while (1) {
5870                 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5871                                  0, cached_state);
5872                 /*
5873                  * We're concerned with the entire range that we're going to be
5874                  * doing DIO to, so we need to make sure there are no ordered
5875                  * extents in this range.
5876                  */
5877                 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5878                                                      lockend - lockstart + 1);
5879
5880                 /*
5881                  * We need to make sure there are no buffered pages in this
5882                  * range either, we could have raced between the invalidate in
5883                  * generic_file_direct_write and locking the extent.  The
5884                  * invalidate needs to happen so that reads after a write do not
5885                  * get stale data.
5886                  */
5887                 if (!ordered && (!writing ||
5888                     !test_range_bit(&BTRFS_I(inode)->io_tree,
5889                                     lockstart, lockend, EXTENT_UPTODATE, 0,
5890                                     *cached_state)))
5891                         break;
5892
5893                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5894                                      cached_state, GFP_NOFS);
5895
5896                 if (ordered) {
5897                         btrfs_start_ordered_extent(inode, ordered, 1);
5898                         btrfs_put_ordered_extent(ordered);
5899                 } else {
5900                         /* Screw you mmap */
5901                         ret = filemap_write_and_wait_range(inode->i_mapping,
5902                                                            lockstart,
5903                                                            lockend);
5904                         if (ret)
5905                                 break;
5906
5907                         /*
5908                          * If we found a page that couldn't be invalidated just
5909                          * fall back to buffered.
5910                          */
5911                         ret = invalidate_inode_pages2_range(inode->i_mapping,
5912                                         lockstart >> PAGE_CACHE_SHIFT,
5913                                         lockend >> PAGE_CACHE_SHIFT);
5914                         if (ret)
5915                                 break;
5916                 }
5917
5918                 cond_resched();
5919         }
5920
5921         return ret;
5922 }
5923
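/*
 * build a pinned extent map for [start, start + len) and force it into
 * the extent map tree, dropping any overlapping cached mappings until
 * the insert succeeds.
 */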
5924 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5925                                            u64 len, u64 orig_start,
5926                                            u64 block_start, u64 block_len,
5927                                            u64 orig_block_len, int type)
5928 {
5929         struct extent_map_tree *em_tree;
5930         struct extent_map *em;
5931         struct btrfs_root *root = BTRFS_I(inode)->root;
5932         int ret;
5933
5934         em_tree = &BTRFS_I(inode)->extent_tree;
5935         em = alloc_extent_map();
5936         if (!em)
5937                 return ERR_PTR(-ENOMEM);
5938
5939         em->start = start;
5940         em->orig_start = orig_start;
5941         em->len = len;
5942         em->block_len = block_len;
5943         em->block_start = block_start;
5944         em->bdev = root->fs_info->fs_devices->latest_bdev;
5945         em->orig_block_len = orig_block_len;
5946         set_bit(EXTENT_FLAG_PINNED, &em->flags);
5947         if (type == BTRFS_ORDERED_PREALLOC)
5948                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
5949
5950         do {
5951                 btrfs_drop_extent_cache(inode, em->start,
5952                                 em->start + em->len - 1, 0);
5953                 write_lock(&em_tree->lock);
5954                 ret = add_extent_mapping(em_tree, em);
5955                 write_unlock(&em_tree->lock);
5956         } while (ret == -EEXIST);
5957
5958         if (ret) {
5959                 free_extent_map(em);
5960                 return ERR_PTR(ret);
5961         }
5962
5963         return em;
5964 }
5965
5966
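/*
 * get_block callback used by __blockdev_direct_IO: map the file range
 * starting at iblock into bh_result.  Writes reserve delalloc space and
 * either reuse a nocow/prealloc extent when can_nocow_odirect says it
 * is safe or cow a new one; inline and compressed extents return
 * -ENOTBLK so the generic code falls back to buffered IO.
 */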
5967 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5968                                    struct buffer_head *bh_result, int create)
5969 {
5970         struct extent_map *em;
5971         struct btrfs_root *root = BTRFS_I(inode)->root;
5972         struct extent_state *cached_state = NULL;
5973         u64 start = iblock << inode->i_blkbits;
5974         u64 lockstart, lockend;
5975         u64 len = bh_result->b_size;
5976         struct btrfs_trans_handle *trans;
5977         int unlock_bits = EXTENT_LOCKED;
5978         int ret;
5979
5980         if (create) {
5981                 ret = btrfs_delalloc_reserve_space(inode, len);
5982                 if (ret)
5983                         return ret;
5984                 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
5985         } else {
5986                 len = min_t(u64, len, root->sectorsize);
5987         }
5988
5989         lockstart = start;
5990         lockend = start + len - 1;
5991
5992         /*
5993          * If this errors out it's because we couldn't invalidate pagecache for
5994          * this range and we need to fall back to buffered.
5995          */
5996         if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
5997                 return -ENOTBLK;
5998
5999         if (create) {
6000                 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6001                                      lockend, EXTENT_DELALLOC, NULL,
6002                                      &cached_state, GFP_NOFS);
6003                 if (ret)
6004                         goto unlock_err;
6005         }
6006
6007         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6008         if (IS_ERR(em)) {
6009                 ret = PTR_ERR(em);
6010                 goto unlock_err;
6011         }
6012
6013         /*
6014          * Ok, for INLINE and COMPRESSED extents we need to fall back on buffered
6015          * io.  INLINE is special, and we could probably kludge it in here, but
6016          * it's still buffered so for safety let's just fall back to the generic
6017          * buffered path.
6018          *
6019          * For COMPRESSED we _have_ to read the entire extent in so we can
6020          * decompress it, so there will be buffering required no matter what we
6021          * do, so go ahead and fallback to buffered.
6022          *
6023          * We return -ENOTBLK because that's what makes DIO go ahead and go back
6024          * to buffered IO.  Don't blame me, this is the price we pay for using
6025          * the generic code.
6026          */
6027         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
6028             em->block_start == EXTENT_MAP_INLINE) {
6029                 free_extent_map(em);
6030                 ret = -ENOTBLK;
6031                 goto unlock_err;
6032         }
6033
6034         /* Just a good old-fashioned hole, return */
6035         if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6036                         test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6037                 free_extent_map(em);
6038                 ret = 0;
6039                 goto unlock_err;
6040         }
6041
6042         /*
6043          * We don't allocate a new extent in the following cases
6044          *
6045          * 1) The inode is marked as NODATACOW.  In this case we'll just use the
6046          * existing extent.
6047          * 2) The extent is marked as PREALLOC.  We're good to go here and can
6048          * just use the extent.
6049          *
6050          */
6051         if (!create) {
6052                 len = min(len, em->len - (start - em->start));
6053                 lockstart = start + len;
6054                 goto unlock;
6055         }
6056
6057         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
6058             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
6059              em->block_start != EXTENT_MAP_HOLE)) {
6060                 int type;
6061                 int ret;
6062                 u64 block_start;
6063
6064                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6065                         type = BTRFS_ORDERED_PREALLOC;
6066                 else
6067                         type = BTRFS_ORDERED_NOCOW;
6068                 len = min(len, em->len - (start - em->start));
6069                 block_start = em->block_start + (start - em->start);
6070
6071                 /*
6072                  * we're not going to log anything, but we do need
6073                  * to make sure the current transaction stays open
6074                  * while we look for nocow cross refs
6075                  */
6076                 trans = btrfs_join_transaction(root);
6077                 if (IS_ERR(trans))
6078                         goto must_cow;
6079
6080                 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6081                         u64 orig_start = em->start;
6082                         u64 orig_block_len = em->orig_block_len;
6083
6084                         if (type == BTRFS_ORDERED_PREALLOC) {
6085                                 free_extent_map(em);
6086                                 em = create_pinned_em(inode, start, len,
6087                                                        orig_start,
6088                                                        block_start, len,
6089                                                        orig_block_len, type);
6090                                 if (IS_ERR(em)) {
6091                                         btrfs_end_transaction(trans, root);
6092                                         goto unlock_err;
6093                                 }
6094                         }
6095
6096                         ret = btrfs_add_ordered_extent_dio(inode, start,
6097                                            block_start, len, len, type);
6098                         btrfs_end_transaction(trans, root);
6099                         if (ret) {
6100                                 free_extent_map(em);
6101                                 goto unlock_err;
6102                         }
6103                         goto unlock;
6104                 }
6105                 btrfs_end_transaction(trans, root);
6106         }
6107 must_cow:
6108         /*
6109          * this will cow the extent, reset the len in case we changed
6110          * it above
6111          */
6112         len = bh_result->b_size;
6113         em = btrfs_new_extent_direct(inode, em, start, len);
6114         if (IS_ERR(em)) {
6115                 ret = PTR_ERR(em);
6116                 goto unlock_err;
6117         }
6118         len = min(len, em->len - (start - em->start));
6119 unlock:
6120         bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
6121                 inode->i_blkbits;
6122         bh_result->b_size = len;
6123         bh_result->b_bdev = em->bdev;
6124         set_buffer_mapped(bh_result);
6125         if (create) {
6126                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6127                         set_buffer_new(bh_result);
6128
6129                 /*
6130                  * Need to update the i_size under the extent lock so buffered
6131                  * readers will get the updated i_size when we unlock.
6132                  */
6133                 if (start + len > i_size_read(inode))
6134                         i_size_write(inode, start + len);
6135         }
6136
6137         /*
6138          * In the case of write we need to clear and unlock the entire range,
6139          * in the case of read we need to unlock only the end area that we
6140          * aren't using if there is any left over space.
6141          */
6142         if (lockstart < lockend) {
6143                 if (create && len < lockend - lockstart) {
6144                         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6145                                          lockstart + len - 1,
6146                                          unlock_bits | EXTENT_DEFRAG, 1, 0,
6147                                          &cached_state, GFP_NOFS);
6148                         /*
6149                          * Besides unlocking, we also need to clean up reserved space
6150                          * for the remaining range by attaching EXTENT_DO_ACCOUNTING.
6151                          */
6152                         clear_extent_bit(&BTRFS_I(inode)->io_tree,
6153                                          lockstart + len, lockend,
6154                                          unlock_bits | EXTENT_DO_ACCOUNTING |
6155                                          EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
6156                 } else {
6157                         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6158                                          lockend, unlock_bits, 1, 0,
6159                                          &cached_state, GFP_NOFS);
6160                 }
6161         } else {
6162                 free_extent_state(cached_state);
6163         }
6164
6165         free_extent_map(em);
6166
6167         return 0;
6168
6169 unlock_err:
6170         if (create)
6171                 unlock_bits |= EXTENT_DO_ACCOUNTING;
6172
6173         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6174                          unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6175         return ret;
6176 }
6177
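/*
 * state carried for the life of one direct IO request; freed by the
 * endio handler once the last of the split bios has completed.
 */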
6178 struct btrfs_dio_private {
6179         struct inode *inode;
6180         u64 logical_offset;
6181         u64 disk_bytenr;
6182         u64 bytes;
6183         void *private;
6184
6185         /* number of bios pending for this dio */
6186         atomic_t pending_bios;
6187
6188         /* IO errors */
6189         int errors;
6190
6191         struct bio *orig_bio;
6192 };
6193
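/*
 * read completion: unless the inode is NODATASUM, verify each block
 * against the csum stashed in the io_tree, then unlock the extent
 * range and complete the original dio bio.
 */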
6194 static void btrfs_endio_direct_read(struct bio *bio, int err)
6195 {
6196         struct btrfs_dio_private *dip = bio->bi_private;
6197         struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
6198         struct bio_vec *bvec = bio->bi_io_vec;
6199         struct inode *inode = dip->inode;
6200         struct btrfs_root *root = BTRFS_I(inode)->root;
6201         u64 start;
6202
6203         start = dip->logical_offset;
6204         do {
6205                 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
6206                         struct page *page = bvec->bv_page;
6207                         char *kaddr;
6208                         u32 csum = ~(u32)0;
6209                         u64 private = ~(u32)0;
6210                         unsigned long flags;
6211
6212                         if (get_state_private(&BTRFS_I(inode)->io_tree,
6213                                               start, &private))
6214                                 goto failed;
6215                         local_irq_save(flags);
6216                         kaddr = kmap_atomic(page);
6217                         csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
6218                                                csum, bvec->bv_len);
6219                         btrfs_csum_final(csum, (char *)&csum);
6220                         kunmap_atomic(kaddr);
6221                         local_irq_restore(flags);
6222
6223                         flush_dcache_page(bvec->bv_page);
6224                         if (csum != private) {
6225 failed:
6226                                 printk(KERN_ERR "btrfs csum failed ino %llu off"
6227                                       " %llu csum %u private %u\n",
6228                                       (unsigned long long)btrfs_ino(inode),
6229                                       (unsigned long long)start,
6230                                       csum, (unsigned)private);
6231                                 err = -EIO;
6232                         }
6233                 }
6234
6235                 start += bvec->bv_len;
6236                 bvec++;
6237         } while (bvec <= bvec_end);
6238
6239         unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
6240                       dip->logical_offset + dip->bytes - 1);
6241         bio->bi_private = dip->private;
6242
6243         kfree(dip);
6244
6245         /* If we had a csum failure make sure to clear the uptodate flag */
6246         if (err)
6247                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
6248         dio_end_io(bio, err);
6249 }
6250
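/*
 * write completion: finish the ordered extent(s) covering this dio by
 * queueing finish_ordered_fn on the endio_write_workers.  A single dio
 * bio may span several ordered extents, hence the loop.
 */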
6251 static void btrfs_endio_direct_write(struct bio *bio, int err)
6252 {
6253         struct btrfs_dio_private *dip = bio->bi_private;
6254         struct inode *inode = dip->inode;
6255         struct btrfs_root *root = BTRFS_I(inode)->root;
6256         struct btrfs_ordered_extent *ordered = NULL;
6257         u64 ordered_offset = dip->logical_offset;
6258         u64 ordered_bytes = dip->bytes;
6259         int ret;
6260
6261         if (err)
6262                 goto out_done;
6263 again:
6264         ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
6265                                                    &ordered_offset,
6266                                                    ordered_bytes, !err);
6267         if (!ret)
6268                 goto out_test;
6269
6270         ordered->work.func = finish_ordered_fn;
6271         ordered->work.flags = 0;
6272         btrfs_queue_worker(&root->fs_info->endio_write_workers,
6273                            &ordered->work);
6274 out_test:
6275         /*
6276          * our bio might span multiple ordered extents.  If we haven't
6277          * completed the accounting for the whole dio, go back and try again
6278          */
6279         if (ordered_offset < dip->logical_offset + dip->bytes) {
6280                 ordered_bytes = dip->logical_offset + dip->bytes -
6281                         ordered_offset;
6282                 ordered = NULL;
6283                 goto again;
6284         }
6285 out_done:
6286         bio->bi_private = dip->private;
6287
6288         kfree(dip);
6289
6290         /* If we had an error make sure to clear the uptodate flag */
6291         if (err)
6292                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
6293         dio_end_io(bio, err);
6294 }
6295
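/*
 * csum hook handed to btrfs_wq_submit_bio: computes the checksums for
 * an async dio write before the bio is mapped to the device.
 */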
6296 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
6297                                     struct bio *bio, int mirror_num,
6298                                     unsigned long bio_flags, u64 offset)
6299 {
6300         int ret;
6301         struct btrfs_root *root = BTRFS_I(inode)->root;
6302         ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
6303         BUG_ON(ret); /* -ENOMEM */
6304         return 0;
6305 }
6306
6307 static void btrfs_end_dio_bio(struct bio *bio, int err)
6308 {
6309         struct btrfs_dio_private *dip = bio->bi_private;
6310
6311         if (err) {
6312                 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
6313                       "sector %#Lx len %u err no %d\n",
6314                       (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
6315                       (unsigned long long)bio->bi_sector, bio->bi_size, err);
6316                 dip->errors = 1;
6317
6318                 /*
6319                  * before the atomic variable goes to zero, we must make sure
6320                  * dip->errors is perceived to be set.
6321                  */
6322                 smp_mb__before_atomic_dec();
6323         }
6324
6325         /* if there are more bios still pending for this dio, just exit */
6326         if (!atomic_dec_and_test(&dip->pending_bios))
6327                 goto out;
6328
6329         if (dip->errors)
6330                 bio_io_error(dip->orig_bio);
6331         else {
6332                 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
6333                 bio_endio(dip->orig_bio, 0);
6334         }
6335 out:
6336         bio_put(bio);
6337 }
6338
6339 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
6340                                        u64 first_sector, gfp_t gfp_flags)
6341 {
6342         int nr_vecs = bio_get_nr_vecs(bdev);
6343         return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
6344 }
6345
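/*
 * send one dio bio down the stack: reads are hooked into the endio
 * workqueue and have their csums looked up, writes are csummed here,
 * either inline or via the async helpers, before btrfs_map_bio sends
 * the bio to the device.
 */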
6346 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6347                                          int rw, u64 file_offset, int skip_sum,
6348                                          int async_submit)
6349 {
6350         int write = rw & REQ_WRITE;
6351         struct btrfs_root *root = BTRFS_I(inode)->root;
6352         int ret;
6353
6354         if (async_submit)
6355                 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6356
6357         bio_get(bio);
6358
6359         if (!write) {
6360                 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6361                 if (ret)
6362                         goto err;
6363         }
6364
6365         if (skip_sum)
6366                 goto map;
6367
6368         if (write && async_submit) {
6369                 ret = btrfs_wq_submit_bio(root->fs_info,
6370                                    inode, rw, bio, 0, 0,
6371                                    file_offset,
6372                                    __btrfs_submit_bio_start_direct_io,
6373                                    __btrfs_submit_bio_done);
6374                 goto err;
6375         } else if (write) {
6376                 /*
6377                  * If we aren't doing async submit, calculate the csum of the
6378                  * bio now.
6379                  */
6380                 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6381                 if (ret)
6382                         goto err;
6383         } else if (!skip_sum) {
6384                 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
6385                 if (ret)
6386                         goto err;
6387         }
6388
6389 map:
6390         ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
6391 err:
6392         bio_put(bio);
6393         return ret;
6394 }
6395
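/*
 * split the original dio bio on the boundaries reported by
 * btrfs_map_block: pages are added to a cloned bio until the next page
 * would cross a stripe, then the clone is submitted and a fresh one is
 * started.  dip->pending_bios tracks the outstanding clones.
 */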
6396 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6397                                     int skip_sum)
6398 {
6399         struct inode *inode = dip->inode;
6400         struct btrfs_root *root = BTRFS_I(inode)->root;
6401         struct bio *bio;
6402         struct bio *orig_bio = dip->orig_bio;
6403         struct bio_vec *bvec = orig_bio->bi_io_vec;
6404         u64 start_sector = orig_bio->bi_sector;
6405         u64 file_offset = dip->logical_offset;
6406         u64 submit_len = 0;
6407         u64 map_length;
6408         int nr_pages = 0;
6409         int ret = 0;
6410         int async_submit = 0;
6411
6412         map_length = orig_bio->bi_size;
6413         ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6414                               &map_length, NULL, 0);
6415         if (ret) {
6416                 bio_put(orig_bio);
6417                 return -EIO;
6418         }
6419
6420         if (map_length >= orig_bio->bi_size) {
6421                 bio = orig_bio;
6422                 goto submit;
6423         }
6424
6425         async_submit = 1;
6426         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6427         if (!bio)
6428                 return -ENOMEM;
6429         bio->bi_private = dip;
6430         bio->bi_end_io = btrfs_end_dio_bio;
6431         atomic_inc(&dip->pending_bios);
6432
6433         while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
6434                 if (unlikely(map_length < submit_len + bvec->bv_len ||
6435                     bio_add_page(bio, bvec->bv_page, bvec->bv_len,
6436                                  bvec->bv_offset) < bvec->bv_len)) {
6437                         /*
6438                          * inc the count before we submit the bio so
6439                          * the end IO handler can't see the count hit
6440                          * zero early.  Otherwise, the dip might get freed
6441                          * before we're done setting it up
6442                          */
6443                         atomic_inc(&dip->pending_bios);
6444                         ret = __btrfs_submit_dio_bio(bio, inode, rw,
6445                                                      file_offset, skip_sum,
6446                                                      async_submit);
6447                         if (ret) {
6448                                 bio_put(bio);
6449                                 atomic_dec(&dip->pending_bios);
6450                                 goto out_err;
6451                         }
6452
6453                         start_sector += submit_len >> 9;
6454                         file_offset += submit_len;
6455
6456                         submit_len = 0;
6457                         nr_pages = 0;
6458
6459                         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
6460                                                   start_sector, GFP_NOFS);
6461                         if (!bio)
6462                                 goto out_err;
6463                         bio->bi_private = dip;
6464                         bio->bi_end_io = btrfs_end_dio_bio;
6465
6466                         map_length = orig_bio->bi_size;
6467                         ret = btrfs_map_block(root->fs_info, READ,
6468                                               start_sector << 9,
6469                                               &map_length, NULL, 0);
6470                         if (ret) {
6471                                 bio_put(bio);
6472                                 goto out_err;
6473                         }
6474                 } else {
6475                         submit_len += bvec->bv_len;
6476                         nr_pages++;
6477                         bvec++;
6478                 }
6479         }
6480
6481 submit:
6482         ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6483                                      async_submit);
6484         if (!ret)
6485                 return 0;
6486
6487         bio_put(bio);
6488 out_err:
6489         dip->errors = 1;
6490         /*
6491          * before the atomic variable goes to zero, we must
6492          * make sure dip->errors is perceived to be set.
6493          */
6494         smp_mb__before_atomic_dec();
6495         if (atomic_dec_and_test(&dip->pending_bios))
6496                 bio_io_error(dip->orig_bio);
6497
6498         /* bio_end_io() will handle the error, so we don't need to return it */
6499         return 0;
6500 }
6501
6502 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
6503                                 loff_t file_offset)
6504 {
6505         struct btrfs_root *root = BTRFS_I(inode)->root;
6506         struct btrfs_dio_private *dip;
6507         struct bio_vec *bvec = bio->bi_io_vec;
6508         int skip_sum;
6509         int write = rw & REQ_WRITE;
6510         int ret = 0;
6511
6512         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
6513
6514         dip = kmalloc(sizeof(*dip), GFP_NOFS);
6515         if (!dip) {
6516                 ret = -ENOMEM;
6517                 goto free_ordered;
6518         }
6519
6520         dip->private = bio->bi_private;
6521         dip->inode = inode;
6522         dip->logical_offset = file_offset;
6523
6524         dip->bytes = 0;
6525         do {
6526                 dip->bytes += bvec->bv_len;
6527                 bvec++;
6528         } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
6529
6530         dip->disk_bytenr = (u64)bio->bi_sector << 9;
6531         bio->bi_private = dip;
6532         dip->errors = 0;
6533         dip->orig_bio = bio;
6534         atomic_set(&dip->pending_bios, 0);
6535
6536         if (write)
6537                 bio->bi_end_io = btrfs_endio_direct_write;
6538         else
6539                 bio->bi_end_io = btrfs_endio_direct_read;
6540
6541         ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
6542         if (!ret)
6543                 return;
6544 free_ordered:
6545         /*
6546          * If this is a write, we need to clean up the reserved space and kill
6547          * the ordered extent.
6548          */
6549         if (write) {
6550                 struct btrfs_ordered_extent *ordered;
6551                 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
6552                 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
6553                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
6554                         btrfs_free_reserved_extent(root, ordered->start,
6555                                                    ordered->disk_len);
6556                 btrfs_put_ordered_extent(ordered);
6557                 btrfs_put_ordered_extent(ordered);
6558         }
6559         bio_endio(bio, ret);
6560 }
6561
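/*
 * validate that a dio request is properly aligned for btrfs: the file
 * offset and every iovec's base address and length must be sectorsize
 * aligned (e.g. with a 4096 byte sectorsize an offset of 8192 passes
 * and 512 does not), and a read must not repeat an iov_base or we'd
 * hit csum errors when reading the data back.
 */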
6562 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
6563                         const struct iovec *iov, loff_t offset,
6564                         unsigned long nr_segs)
6565 {
6566         int seg;
6567         int i;
6568         size_t size;
6569         unsigned long addr;
6570         unsigned blocksize_mask = root->sectorsize - 1;
6571         ssize_t retval = -EINVAL;
6572         loff_t end = offset;
6573
6574         if (offset & blocksize_mask)
6575                 goto out;
6576
6577         /* Check the memory alignment.  Blocks cannot straddle pages */
6578         for (seg = 0; seg < nr_segs; seg++) {
6579                 addr = (unsigned long)iov[seg].iov_base;
6580                 size = iov[seg].iov_len;
6581                 end += size;
6582                 if ((addr & blocksize_mask) || (size & blocksize_mask))
6583                         goto out;
6584
6585                 /* If this is a write we don't need to check anymore */
6586                 if (rw & WRITE)
6587                         continue;
6588
6589                 /*
6590                  * Check to make sure we don't have duplicate iov_base's in this
6591                  * iovec; if we do, return EINVAL, otherwise we'll get csum errors
6592                  * when reading back.
6593                  */
6594                 for (i = seg + 1; i < nr_segs; i++) {
6595                         if (iov[seg].iov_base == iov[i].iov_base)
6596                                 goto out;
6597                 }
6598         }
6599         retval = 0;
6600 out:
6601         return retval;
6602 }
6603
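/*
 * entry point for O_DIRECT: return 0 when the alignment checks fail so
 * that the generic code falls back to buffered IO, otherwise hand the
 * request to __blockdev_direct_IO with our get_blocks and submit hooks.
 */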
6604 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6605                         const struct iovec *iov, loff_t offset,
6606                         unsigned long nr_segs)
6607 {
6608         struct file *file = iocb->ki_filp;
6609         struct inode *inode = file->f_mapping->host;
6610
6611         if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6612                             offset, nr_segs))
6613                 return 0;
6614
6615         return __blockdev_direct_IO(rw, iocb, inode,
6616                    BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
6617                    iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
6618                    btrfs_submit_direct, 0);
6619 }
6620
6621 #define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
6622
6623 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6624                 __u64 start, __u64 len)
6625 {
6626         int     ret;
6627
6628         ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6629         if (ret)
6630                 return ret;
6631
6632         return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6633 }
6634
6635 int btrfs_readpage(struct file *file, struct page *page)
6636 {
6637         struct extent_io_tree *tree;
6638         tree = &BTRFS_I(page->mapping->host)->io_tree;
6639         return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6640 }
6641
6642 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
6643 {
6644         struct extent_io_tree *tree;
6645
6647         if (current->flags & PF_MEMALLOC) {
6648                 redirty_page_for_writepage(wbc, page);
6649                 unlock_page(page);
6650                 return 0;
6651         }
6652         tree = &BTRFS_I(page->mapping->host)->io_tree;
6653         return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
6654 }
6655
6656 int btrfs_writepages(struct address_space *mapping,
6657                      struct writeback_control *wbc)
6658 {
6659         struct extent_io_tree *tree;
6660
6661         tree = &BTRFS_I(mapping->host)->io_tree;
6662         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
6663 }
6664
6665 static int
6666 btrfs_readpages(struct file *file, struct address_space *mapping,
6667                 struct list_head *pages, unsigned nr_pages)
6668 {
6669         struct extent_io_tree *tree;
6670         tree = &BTRFS_I(mapping->host)->io_tree;
6671         return extent_readpages(tree, mapping, pages, nr_pages,
6672                                 btrfs_get_extent);
6673 }
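
/*
 * try to drop the extent map and io state attached to a page; on
 * success the page's private state is torn down so the page can be
 * freed.
 */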
6674 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6675 {
6676         struct extent_io_tree *tree;
6677         struct extent_map_tree *map;
6678         int ret;
6679
6680         tree = &BTRFS_I(page->mapping->host)->io_tree;
6681         map = &BTRFS_I(page->mapping->host)->extent_tree;
6682         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
6683         if (ret == 1) {
6684                 ClearPagePrivate(page);
6685                 set_page_private(page, 0);
6686                 page_cache_release(page);
6687         }
6688         return ret;
6689 }
6690
6691 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6692 {
6693         if (PageWriteback(page) || PageDirty(page))
6694                 return 0;
6695         return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
6696 }
6697
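/*
 * called when a page is being removed from the address space: a
 * partial-page invalidate just tries to release the page, a full one
 * accounts for any pending ordered extent and clears the extent state
 * bits for the page's range before dropping PagePrivate.
 */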
6698 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6699 {
6700         struct inode *inode = page->mapping->host;
6701         struct extent_io_tree *tree;
6702         struct btrfs_ordered_extent *ordered;
6703         struct extent_state *cached_state = NULL;
6704         u64 page_start = page_offset(page);
6705         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
6706
6707         /*
6708          * we have the page locked, so new writeback can't start,
6709          * and the dirty bit won't be cleared while we are here.
6710          *
6711          * Wait for IO on this page so that we can safely clear
6712          * the PagePrivate2 bit and do ordered accounting
6713          */
6714         wait_on_page_writeback(page);
6715
6716         tree = &BTRFS_I(inode)->io_tree;
6717         if (offset) {
6718                 btrfs_releasepage(page, GFP_NOFS);
6719                 return;
6720         }
6721         lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6722         ordered = btrfs_lookup_ordered_extent(inode,
6723                                            page_offset(page));
6724         if (ordered) {
6725                 /*
6726                  * IO on this page will never be started, so we need
6727                  * to account for any ordered extents now
6728                  */
6729                 clear_extent_bit(tree, page_start, page_end,
6730                                  EXTENT_DIRTY | EXTENT_DELALLOC |
6731                                  EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
6732                                  EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
6733                 /*
6734                  * whoever cleared the private bit is responsible
6735                  * for the finish_ordered_io
6736                  */
6737                 if (TestClearPagePrivate2(page) &&
6738                     btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
6739                                                    PAGE_CACHE_SIZE, 1)) {
6740                         btrfs_finish_ordered_io(ordered);
6741                 }
6742                 btrfs_put_ordered_extent(ordered);
6743                 cached_state = NULL;
6744                 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6745         }
6746         clear_extent_bit(tree, page_start, page_end,
6747                  EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6748                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
6749                  &cached_state, GFP_NOFS);
6750         __btrfs_releasepage(page, GFP_NOFS);
6751
6752         ClearPageChecked(page);
6753         if (PagePrivate(page)) {
6754                 ClearPagePrivate(page);
6755                 set_page_private(page, 0);
6756                 page_cache_release(page);
6757         }
6758 }
6759
6760 /*
6761  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
6762  * called from a page fault handler when a page is first dirtied. Hence we must
6763  * be careful to check for EOF conditions here. We set the page up correctly
6764  * for a written page which means we get ENOSPC checking when writing into
6765  * holes and correct delalloc and unwritten extent mapping on filesystems that
6766  * support these features.
6767  *
6768  * We are not allowed to take the i_mutex here so we have to play games to
6769  * protect against truncate races as the page could now be beyond EOF.  Because
6770  * vmtruncate() writes the inode size before removing pages, once we have the
6771  * page lock we can determine safely if the page is beyond EOF. If it is not
6772  * beyond EOF, then the page is guaranteed safe against truncation until we
6773  * unlock the page.
6774  */
6775 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6776 {
6777         struct page *page = vmf->page;
6778         struct inode *inode = fdentry(vma->vm_file)->d_inode;
6779         struct btrfs_root *root = BTRFS_I(inode)->root;
6780         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6781         struct btrfs_ordered_extent *ordered;
6782         struct extent_state *cached_state = NULL;
6783         char *kaddr;
6784         unsigned long zero_start;
6785         loff_t size;
6786         int ret;
6787         int reserved = 0;
6788         u64 page_start;
6789         u64 page_end;
6790
6791         sb_start_pagefault(inode->i_sb);
6792         ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6793         if (!ret) {
6794                 ret = file_update_time(vma->vm_file);
6795                 reserved = 1;
6796         }
6797         if (ret) {
6798                 if (ret == -ENOMEM)
6799                         ret = VM_FAULT_OOM;
6800                 else /* -ENOSPC, -EIO, etc */
6801                         ret = VM_FAULT_SIGBUS;
6802                 if (reserved)
6803                         goto out;
6804                 goto out_noreserve;
6805         }
6806
6807         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
6808 again:
6809         lock_page(page);
6810         size = i_size_read(inode);
6811         page_start = page_offset(page);
6812         page_end = page_start + PAGE_CACHE_SIZE - 1;
6813
6814         if ((page->mapping != inode->i_mapping) ||
6815             (page_start >= size)) {
6816                 /* page got truncated out from underneath us */
6817                 goto out_unlock;
6818         }
6819         wait_on_page_writeback(page);
6820
6821         lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
6822         set_page_extent_mapped(page);
6823
6824         /*
6825          * we can't set the delalloc bits if there are pending ordered
6826          * extents.  Drop our locks and wait for them to finish
6827          */
6828         ordered = btrfs_lookup_ordered_extent(inode, page_start);
6829         if (ordered) {
6830                 unlock_extent_cached(io_tree, page_start, page_end,
6831                                      &cached_state, GFP_NOFS);
6832                 unlock_page(page);
6833                 btrfs_start_ordered_extent(inode, ordered, 1);
6834                 btrfs_put_ordered_extent(ordered);
6835                 goto again;
6836         }
6837
6838         /*
6839          * XXX - page_mkwrite gets called every time the page is dirtied, even
6840          * if it was already dirty, so for space accounting reasons we need to
6841          * clear any delalloc bits for the range we are about to write.  There
6842          * is probably a better way to do this, but for now keep consistent with
6843          * prepare_pages in the normal write path.
6844          */
6845         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6846                           EXTENT_DIRTY | EXTENT_DELALLOC |
6847                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
6848                           0, 0, &cached_state, GFP_NOFS);
6849
6850         ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
6851                                         &cached_state);
6852         if (ret) {
6853                 unlock_extent_cached(io_tree, page_start, page_end,
6854                                      &cached_state, GFP_NOFS);
6855                 ret = VM_FAULT_SIGBUS;
6856                 goto out_unlock;
6857         }
6858         ret = 0;
6859
6860         /* page is wholly or partially inside EOF */
6861         if (page_start + PAGE_CACHE_SIZE > size)
6862                 zero_start = size & ~PAGE_CACHE_MASK;
6863         else
6864                 zero_start = PAGE_CACHE_SIZE;
6865
6866         if (zero_start != PAGE_CACHE_SIZE) {
6867                 kaddr = kmap(page);
6868                 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
6869                 flush_dcache_page(page);
6870                 kunmap(page);
6871         }
6872         ClearPageChecked(page);
6873         set_page_dirty(page);
6874         SetPageUptodate(page);
6875
6876         BTRFS_I(inode)->last_trans = root->fs_info->generation;
6877         BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6878         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
6879
6880         unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6881
6882 out_unlock:
6883         if (!ret) {
6884                 sb_end_pagefault(inode->i_sb);
6885                 return VM_FAULT_LOCKED;
6886         }
6887         unlock_page(page);
6888 out:
6889         btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6890 out_noreserve:
6891         sb_end_pagefault(inode->i_sb);
6892         return ret;
6893 }
6894
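/*
 * Truncate everything beyond inode->i_size: zero the tail page, wait for
 * ordered IO past the new size, then drop the extent items in chunks,
 * restarting the transaction as needed.  The reservation dance this
 * requires is described in the long comment below.
 */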
6895 static int btrfs_truncate(struct inode *inode)
6896 {
6897         struct btrfs_root *root = BTRFS_I(inode)->root;
6898         struct btrfs_block_rsv *rsv;
6899         int ret;
6900         int err = 0;
6901         struct btrfs_trans_handle *trans;
6902         u64 mask = root->sectorsize - 1;
6903         u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6904
6905         ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
6906         if (ret)
6907                 return ret;
6908
6909         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6910         btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6911
6912         /*
6913          * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
6914          * 3 things going on here
6915          *
6916          * 1) We need to reserve space for our orphan item and the space to
6917          * delete our orphan item.  Lord knows we don't want to have a dangling
6918          * orphan item because we didn't reserve space to remove it.
6919          *
6920          * 2) We need to reserve space to update our inode.
6921          *
6922          * 3) We need to have something to cache all the space that is going to
6923          * be freed up by the truncate operation, but also have some slack
6924          * space reserved in case it uses space during the truncate (thank you
6925          * very much snapshotting).
6926          *
6927          * And we need these to all be separate.  The fact is we can use a lot of
6928          * space doing the truncate, and we have no earthly idea how much space
6929          * we will use, so we need the truncate reservation to be separate so it
6930          * doesn't end up using space reserved for updating the inode or
6931          * removing the orphan item.  We also need to be able to stop the
6932          * transaction and start a new one, which means we need to be able to
6933          * update the inode several times, and we have no way of knowing how
6934          * many times that will be, so we can't just reserve 1 item for the
6935          * entirety of the operation, so that has to be done separately as well.
6936          * Then there is the orphan item, which does indeed need to be held on
6937          * to for the whole operation, and we need nobody to touch this reserved
6938          * space except the orphan code.
6939          *
6940          * So that leaves us with
6941          *
6942          * 1) root->orphan_block_rsv - for the orphan deletion.
6943          * 2) rsv - for the truncate reservation, which we will steal from the
6944          * transaction reservation.
6945          * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
6946          * updating the inode.
6947          */
6948         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
6949         if (!rsv)
6950                 return -ENOMEM;
6951         rsv->size = min_size;
6952         rsv->failfast = 1;
6953
6954         /*
6955          * 1 for the truncate slack space
6956          * 1 for the orphan item we're going to add
6957          * 1 for the orphan item deletion
6958          * 1 for updating the inode.
6959          */
6960         trans = btrfs_start_transaction(root, 4);
6961         if (IS_ERR(trans)) {
6962                 err = PTR_ERR(trans);
6963                 goto out;
6964         }
6965
6966         /* Migrate the slack space for the truncate to our reserve */
6967         ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6968                                       min_size);
6969         BUG_ON(ret);
6970
6971         ret = btrfs_orphan_add(trans, inode);
6972         if (ret) {
6973                 btrfs_end_transaction(trans, root);
6974                 goto out;
6975         }
6976
6977         /*
6978          * setattr is responsible for setting the ordered_data_close flag,
6979          * but that is only tested during the last file release.  That
6980          * could happen well after the next commit, leaving a great big
6981          * window where new writes may get lost if someone chooses to write
6982          * to this file after truncating to zero
6983          *
6984          * The inode doesn't have any dirty data here, and so if we commit
6985          * this is a noop.  If someone immediately starts writing to the inode
6986          * it is very likely we'll catch some of their writes in this
6987          * transaction, and the commit will find this file on the ordered
6988          * data list with good things to send down.
6989          *
6990          * This is a best effort solution, there is still a window where
6991          * using truncate to replace the contents of the file will
6992          * end up with a zero length file after a crash.
6993          */
6994         if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
6995                                            &BTRFS_I(inode)->runtime_flags))
6996                 btrfs_add_ordered_operation(trans, root, inode);
6997
6998         /*
6999          * So if we truncate and then write and fsync we normally would just
7000          * write the extents that changed, which is a problem if we need to
7001          * first truncate that entire inode.  So set this flag so we write out
7002          * all of the extents in the inode to the sync log so we're completely
7003          * safe.
7004          */
7005         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
7006         trans->block_rsv = rsv;
7007
7008         while (1) {
7009                 ret = btrfs_truncate_inode_items(trans, root, inode,
7010                                                  inode->i_size,
7011                                                  BTRFS_EXTENT_DATA_KEY);
7012                 if (ret != -ENOSPC) {
7013                         err = ret;
7014                         break;
7015                 }
7016
7017                 trans->block_rsv = &root->fs_info->trans_block_rsv;
7018                 ret = btrfs_update_inode(trans, root, inode);
7019                 if (ret) {
7020                         err = ret;
7021                         break;
7022                 }
7023
7024                 btrfs_end_transaction(trans, root);
7025                 btrfs_btree_balance_dirty(root);
7026
7027                 trans = btrfs_start_transaction(root, 2);
7028                 if (IS_ERR(trans)) {
7029                         ret = err = PTR_ERR(trans);
7030                         trans = NULL;
7031                         break;
7032                 }
7033
7034                 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
7035                                               rsv, min_size);
7036                 BUG_ON(ret);    /* shouldn't happen */
7037                 trans->block_rsv = rsv;
7038         }
7039
7040         if (ret == 0 && inode->i_nlink > 0) {
7041                 trans->block_rsv = root->orphan_block_rsv;
7042                 ret = btrfs_orphan_del(trans, inode);
7043                 if (ret)
7044                         err = ret;
7045         } else if (ret && inode->i_nlink > 0) {
7046                 /*
7047                  * Failed to do the truncate; remove us from the in-memory
7048                  * orphan list.
7049                  */
7050                 ret = btrfs_orphan_del(NULL, inode);
7051         }
7052
7053         if (trans) {
7054                 trans->block_rsv = &root->fs_info->trans_block_rsv;
7055                 ret = btrfs_update_inode(trans, root, inode);
7056                 if (ret && !err)
7057                         err = ret;
7058
7059                 ret = btrfs_end_transaction(trans, root);
7060                 btrfs_btree_balance_dirty(root);
7061         }
7062
7063 out:
7064         btrfs_free_block_rsv(root, rsv);
7065
7066         if (ret && !err)
7067                 err = ret;
7068
7069         return err;
7070 }
7071
7072 /*
7073  * create a new subvolume directory/inode (helper for the ioctl).
7074  */
7075 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7076                              struct btrfs_root *new_root, u64 new_dirid)
7077 {
7078         struct inode *inode;
7079         int err;
7080         u64 index = 0;
7081
7082         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
7083                                 new_dirid, new_dirid,
7084                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
7085                                 &index);
7086         if (IS_ERR(inode))
7087                 return PTR_ERR(inode);
7088         inode->i_op = &btrfs_dir_inode_operations;
7089         inode->i_fop = &btrfs_dir_file_operations;
7090
7091         set_nlink(inode, 1);
7092         btrfs_i_size_write(inode, 0);
7093
7094         err = btrfs_update_inode(trans, new_root, inode);
7095
7096         iput(inode);
7097         return err;
7098 }
7099
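/*
 * Allocate an in-memory btrfs inode from the slab cache and initialize
 * the btrfs-specific fields (extent trees, ordered tree, locks and lists)
 * before handing the embedded VFS inode back to the caller.
 */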
7100 struct inode *btrfs_alloc_inode(struct super_block *sb)
7101 {
7102         struct btrfs_inode *ei;
7103         struct inode *inode;
7104
7105         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
7106         if (!ei)
7107                 return NULL;
7108
7109         ei->root = NULL;
7110         ei->generation = 0;
7111         ei->last_trans = 0;
7112         ei->last_sub_trans = 0;
7113         ei->logged_trans = 0;
7114         ei->delalloc_bytes = 0;
7115         ei->disk_i_size = 0;
7116         ei->flags = 0;
7117         ei->csum_bytes = 0;
7118         ei->index_cnt = (u64)-1;
7119         ei->last_unlink_trans = 0;
7120         ei->last_log_commit = 0;
7121
7122         spin_lock_init(&ei->lock);
7123         ei->outstanding_extents = 0;
7124         ei->reserved_extents = 0;
7125
7126         ei->runtime_flags = 0;
7127         ei->force_compress = BTRFS_COMPRESS_NONE;
7128
7129         ei->delayed_node = NULL;
7130
7131         inode = &ei->vfs_inode;
7132         extent_map_tree_init(&ei->extent_tree);
7133         extent_io_tree_init(&ei->io_tree, &inode->i_data);
7134         extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7135         ei->io_tree.track_uptodate = 1;
7136         ei->io_failure_tree.track_uptodate = 1;
7137         atomic_set(&ei->sync_writers, 0);
7138         mutex_init(&ei->log_mutex);
7139         mutex_init(&ei->delalloc_mutex);
7140         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
7141         INIT_LIST_HEAD(&ei->delalloc_inodes);
7142         INIT_LIST_HEAD(&ei->ordered_operations);
7143         RB_CLEAR_NODE(&ei->rb_node);
7144
7145         return inode;
7146 }
7147
7148 static void btrfs_i_callback(struct rcu_head *head)
7149 {
7150         struct inode *inode = container_of(head, struct inode, i_rcu);
7151         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7152 }
7153
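/*
 * Final per-inode teardown: warn about any accounting we would be leaking,
 * drop leftover ordered extents and orphan state, remove the inode from
 * the per-root inode tree and free it via RCU.
 */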
7154 void btrfs_destroy_inode(struct inode *inode)
7155 {
7156         struct btrfs_ordered_extent *ordered;
7157         struct btrfs_root *root = BTRFS_I(inode)->root;
7158
7159         WARN_ON(!hlist_empty(&inode->i_dentry));
7160         WARN_ON(inode->i_data.nrpages);
7161         WARN_ON(BTRFS_I(inode)->outstanding_extents);
7162         WARN_ON(BTRFS_I(inode)->reserved_extents);
7163         WARN_ON(BTRFS_I(inode)->delalloc_bytes);
7164         WARN_ON(BTRFS_I(inode)->csum_bytes);
7165
7166         /*
7167          * This can happen when we create an inode, but somebody else also
7168          * created the same inode and we need to destroy the one we already
7169          * created.
7170          */
7171         if (!root)
7172                 goto free;
7173
7174         /*
7175          * Make sure we're properly removed from the ordered operation
7176          * lists.
7177          */
7178         smp_mb();
7179         if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7180                 spin_lock(&root->fs_info->ordered_extent_lock);
7181                 list_del_init(&BTRFS_I(inode)->ordered_operations);
7182                 spin_unlock(&root->fs_info->ordered_extent_lock);
7183         }
7184
7185         if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
7186                      &BTRFS_I(inode)->runtime_flags)) {
7187                 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
7188                        (unsigned long long)btrfs_ino(inode));
7189                 atomic_dec(&root->orphan_inodes);
7190         }
7191
7192         while (1) {
7193                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7194                 if (!ordered)
7195                         break;
7196                 else {
7197                         printk(KERN_ERR "btrfs found ordered "
7198                                "extent %llu %llu on inode cleanup\n",
7199                                (unsigned long long)ordered->file_offset,
7200                                (unsigned long long)ordered->len);
7201                         btrfs_remove_ordered_extent(inode, ordered);
7202                         btrfs_put_ordered_extent(ordered);
7203                         btrfs_put_ordered_extent(ordered);
7204                 }
7205         }
7206         inode_tree_del(inode);
7207         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
7208 free:
7209         btrfs_remove_delayed_node(inode);
7210         call_rcu(&inode->i_rcu, btrfs_i_callback);
7211 }
7212
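/*
 * Decide whether an inode should be dropped from the cache when its last
 * reference goes away.  Inodes of a dead root are dropped immediately,
 * except the free space inode, which is still needed for cleanup.
 */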
7213 int btrfs_drop_inode(struct inode *inode)
7214 {
7215         struct btrfs_root *root = BTRFS_I(inode)->root;
7216
7217         if (btrfs_root_refs(&root->root_item) == 0 &&
7218             !btrfs_is_free_space_inode(inode))
7219                 return 1;
7220         else
7221                 return generic_drop_inode(inode);
7222 }
7223
7224 static void init_once(void *foo)
7225 {
7226         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
7227
7228         inode_init_once(&ei->vfs_inode);
7229 }
7230
7231 void btrfs_destroy_cachep(void)
7232 {
7233         /*
7234          * Make sure all RCU-delayed inode frees have run before we
7235          * destroy the caches.
7236          */
7237         rcu_barrier();
7238         if (btrfs_inode_cachep)
7239                 kmem_cache_destroy(btrfs_inode_cachep);
7240         if (btrfs_trans_handle_cachep)
7241                 kmem_cache_destroy(btrfs_trans_handle_cachep);
7242         if (btrfs_transaction_cachep)
7243                 kmem_cache_destroy(btrfs_transaction_cachep);
7244         if (btrfs_path_cachep)
7245                 kmem_cache_destroy(btrfs_path_cachep);
7246         if (btrfs_free_space_cachep)
7247                 kmem_cache_destroy(btrfs_free_space_cachep);
7248         if (btrfs_delalloc_work_cachep)
7249                 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7250 }
7251
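/*
 * Create all of the slab caches declared above.  On any failure, everything
 * that was already created is torn down again via btrfs_destroy_cachep().
 */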
7252 int btrfs_init_cachep(void)
7253 {
7254         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7255                         sizeof(struct btrfs_inode), 0,
7256                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7257         if (!btrfs_inode_cachep)
7258                 goto fail;
7259
7260         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7261                         sizeof(struct btrfs_trans_handle), 0,
7262                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7263         if (!btrfs_trans_handle_cachep)
7264                 goto fail;
7265
7266         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7267                         sizeof(struct btrfs_transaction), 0,
7268                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7269         if (!btrfs_transaction_cachep)
7270                 goto fail;
7271
7272         btrfs_path_cachep = kmem_cache_create("btrfs_path",
7273                         sizeof(struct btrfs_path), 0,
7274                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7275         if (!btrfs_path_cachep)
7276                 goto fail;
7277
7278         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7279                         sizeof(struct btrfs_free_space), 0,
7280                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7281         if (!btrfs_free_space_cachep)
7282                 goto fail;
7283
7284         btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7285                         sizeof(struct btrfs_delalloc_work), 0,
7286                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7287                         NULL);
7288         if (!btrfs_delalloc_work_cachep)
7289                 goto fail;
7290
7291         return 0;
7292 fail:
7293         btrfs_destroy_cachep();
7294         return -ENOMEM;
7295 }
7296
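/*
 * stat() for btrfs: report the per-subvolume anonymous device number and
 * include not-yet-written delalloc bytes in the block count.
 */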
7297 static int btrfs_getattr(struct vfsmount *mnt,
7298                          struct dentry *dentry, struct kstat *stat)
7299 {
7300         struct inode *inode = dentry->d_inode;
7301         u32 blocksize = inode->i_sb->s_blocksize;
7302
7303         generic_fillattr(inode, stat);
7304         stat->dev = BTRFS_I(inode)->root->anon_dev;
7305         stat->blksize = PAGE_CACHE_SIZE;
7306         stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7307                 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
7308         return 0;
7309 }
7310
7311 /*
7312  * If a file is moved, it will inherit the COW and compression flags of the new
7313  * directory.
7314  */
7315 static void fixup_inode_flags(struct inode *dir, struct inode *inode)
7316 {
7317         struct btrfs_inode *b_dir = BTRFS_I(dir);
7318         struct btrfs_inode *b_inode = BTRFS_I(inode);
7319
7320         if (b_dir->flags & BTRFS_INODE_NODATACOW)
7321                 b_inode->flags |= BTRFS_INODE_NODATACOW;
7322         else
7323                 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
7324
7325         if (b_dir->flags & BTRFS_INODE_COMPRESS) {
7326                 b_inode->flags |= BTRFS_INODE_COMPRESS;
7327                 b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
7328         } else {
7329                 b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
7330                                     BTRFS_INODE_NOCOMPRESS);
7331         }
7332 }
7333
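/*
 * Rename within one btrfs root; cross-subvolume renames are rejected with
 * -EXDEV.  The old name is unlinked and the new one linked in inside a
 * single transaction, with the log pinned so that after a crash the inode
 * is reachable under either the old or the new name.
 */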
7334 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7335                            struct inode *new_dir, struct dentry *new_dentry)
7336 {
7337         struct btrfs_trans_handle *trans;
7338         struct btrfs_root *root = BTRFS_I(old_dir)->root;
7339         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7340         struct inode *new_inode = new_dentry->d_inode;
7341         struct inode *old_inode = old_dentry->d_inode;
7342         struct timespec ctime = CURRENT_TIME;
7343         u64 index = 0;
7344         u64 root_objectid;
7345         int ret;
7346         u64 old_ino = btrfs_ino(old_inode);
7347
7348         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
7349                 return -EPERM;
7350
7351         /* we only allow renaming subvolume links between subvolumes */
7352         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
7353                 return -EXDEV;
7354
7355         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
7356             (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
7357                 return -ENOTEMPTY;
7358
7359         if (S_ISDIR(old_inode->i_mode) && new_inode &&
7360             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7361                 return -ENOTEMPTY;
7362         /*
7363          * we're using rename to replace one file with another,
7364          * and the replacement file is large.  Start IO on it now so
7365          * we don't add too much work to the end of the transaction
7366          */
7367         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
7368             old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
7369                 filemap_flush(old_inode->i_mapping);
7370
7371         /* close the race window with the snapshot create/destroy ioctls */
7372         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7373                 down_read(&root->fs_info->subvol_sem);
7374         /*
7375          * We want to reserve the absolute worst-case number of items.  So if
7376          * both inodes are subvols and we need to unlink them then that would
7377          * require 4 item modifications, but if they are both normal inodes it
7378          * would require 5 item modifications, so we'll assume they're normal
7379          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
7380          * should cover the worst-case number of items we'll modify.
7381          */
7382         trans = btrfs_start_transaction(root, 20);
7383         if (IS_ERR(trans)) {
7384                 ret = PTR_ERR(trans);
7385                 goto out_notrans;
7386         }
7387
7388         if (dest != root)
7389                 btrfs_record_root_in_trans(trans, dest);
7390
7391         ret = btrfs_set_inode_index(new_dir, &index);
7392         if (ret)
7393                 goto out_fail;
7394
7395         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7396                 /* force a full log commit if a subvolume is involved */
7397                 root->fs_info->last_trans_log_full_commit = trans->transid;
7398         } else {
7399                 ret = btrfs_insert_inode_ref(trans, dest,
7400                                              new_dentry->d_name.name,
7401                                              new_dentry->d_name.len,
7402                                              old_ino,
7403                                              btrfs_ino(new_dir), index);
7404                 if (ret)
7405                         goto out_fail;
7406                 /*
7407                  * this is an ugly little race, but the rename is required
7408                  * to make sure that if we crash, the inode is either at the
7409                  * old name or the new one.  pinning the log transaction lets
7410                  * us make sure we don't allow a log commit to come in after
7411                  * we unlink the name but before we add the new name back in.
7412                  */
7413                 btrfs_pin_log_trans(root);
7414         }
7415         /*
7416          * make sure the inode gets flushed if it is replacing
7417          * something.
7418          */
7419         if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
7420                 btrfs_add_ordered_operation(trans, root, old_inode);
7421
7422         inode_inc_iversion(old_dir);
7423         inode_inc_iversion(new_dir);
7424         inode_inc_iversion(old_inode);
7425         old_dir->i_ctime = old_dir->i_mtime = ctime;
7426         new_dir->i_ctime = new_dir->i_mtime = ctime;
7427         old_inode->i_ctime = ctime;
7428
7429         if (old_dentry->d_parent != new_dentry->d_parent)
7430                 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
7431
7432         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7433                 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
7434                 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
7435                                         old_dentry->d_name.name,
7436                                         old_dentry->d_name.len);
7437         } else {
7438                 ret = __btrfs_unlink_inode(trans, root, old_dir,
7439                                         old_dentry->d_inode,
7440                                         old_dentry->d_name.name,
7441                                         old_dentry->d_name.len);
7442                 if (!ret)
7443                         ret = btrfs_update_inode(trans, root, old_inode);
7444         }
7445         if (ret) {
7446                 btrfs_abort_transaction(trans, root, ret);
7447                 goto out_fail;
7448         }
7449
7450         if (new_inode) {
7451                 inode_inc_iversion(new_inode);
7452                 new_inode->i_ctime = CURRENT_TIME;
7453                 if (unlikely(btrfs_ino(new_inode) ==
7454                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
7455                         root_objectid = BTRFS_I(new_inode)->location.objectid;
7456                         ret = btrfs_unlink_subvol(trans, dest, new_dir,
7457                                                 root_objectid,
7458                                                 new_dentry->d_name.name,
7459                                                 new_dentry->d_name.len);
7460                         BUG_ON(new_inode->i_nlink == 0);
7461                 } else {
7462                         ret = btrfs_unlink_inode(trans, dest, new_dir,
7463                                                  new_dentry->d_inode,
7464                                                  new_dentry->d_name.name,
7465                                                  new_dentry->d_name.len);
7466                 }
7467                 if (!ret && new_inode->i_nlink == 0) {
7468                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
7469                         BUG_ON(ret);
7470                 }
7471                 if (ret) {
7472                         btrfs_abort_transaction(trans, root, ret);
7473                         goto out_fail;
7474                 }
7475         }
7476
7477         fixup_inode_flags(new_dir, old_inode);
7478
7479         ret = btrfs_add_link(trans, new_dir, old_inode,
7480                              new_dentry->d_name.name,
7481                              new_dentry->d_name.len, 0, index);
7482         if (ret) {
7483                 btrfs_abort_transaction(trans, root, ret);
7484                 goto out_fail;
7485         }
7486
7487         if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
7488                 struct dentry *parent = new_dentry->d_parent;
7489                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
7490                 btrfs_end_log_trans(root);
7491         }
7492 out_fail:
7493         btrfs_end_transaction(trans, root);
7494 out_notrans:
7495         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7496                 up_read(&root->fs_info->subvol_sem);
7497
7498         return ret;
7499 }
7500
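/*
 * Worker callback for delalloc flushing: either wait for all ordered IO
 * on the inode or just kick off writeback, then drop the inode reference
 * (possibly via a delayed iput) and signal completion.
 */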
7501 static void btrfs_run_delalloc_work(struct btrfs_work *work)
7502 {
7503         struct btrfs_delalloc_work *delalloc_work;
7504
7505         delalloc_work = container_of(work, struct btrfs_delalloc_work,
7506                                      work);
7507         if (delalloc_work->wait)
7508                 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7509         else
7510                 filemap_flush(delalloc_work->inode->i_mapping);
7511
7512         if (delalloc_work->delay_iput)
7513                 btrfs_add_delayed_iput(delalloc_work->inode);
7514         else
7515                 iput(delalloc_work->inode);
7516         complete(&delalloc_work->completion);
7517 }
7518
7519 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7520                                                     int wait, int delay_iput)
7521 {
7522         struct btrfs_delalloc_work *work;
7523
7524         work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7525         if (!work)
7526                 return NULL;
7527
7528         init_completion(&work->completion);
7529         INIT_LIST_HEAD(&work->list);
7530         work->inode = inode;
7531         work->wait = wait;
7532         work->delay_iput = delay_iput;
7533         work->work.func = btrfs_run_delalloc_work;
7534
7535         return work;
7536 }
7537
7538 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7539 {
7540         wait_for_completion(&work->completion);
7541         kmem_cache_free(btrfs_delalloc_work_cachep, work);
7542 }
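
/*
 * A minimal usage sketch (this is what btrfs_start_delalloc_inodes()
 * below does for every delalloc inode):
 *
 *	work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
 *	if (!work)
 *		return -ENOMEM;
 *	btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
 *	...
 *	btrfs_wait_and_free_delalloc_work(work);
 */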
7543
7544 /*
7545  * some fairly slow code that needs optimization. This walks the list
7546  * of all the inodes with pending delalloc and forces them to disk.
7547  */
7548 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7549 {
7550         struct list_head *head = &root->fs_info->delalloc_inodes;
7551         struct btrfs_inode *binode;
7552         struct inode *inode;
7553         struct btrfs_delalloc_work *work, *next;
7554         struct list_head works;
7555         int ret = 0;
7556
7557         if (root->fs_info->sb->s_flags & MS_RDONLY)
7558                 return -EROFS;
7559
7560         INIT_LIST_HEAD(&works);
7561
7562         spin_lock(&root->fs_info->delalloc_lock);
7563         while (!list_empty(head)) {
7564                 binode = list_entry(head->next, struct btrfs_inode,
7565                                     delalloc_inodes);
7566                 inode = igrab(&binode->vfs_inode);
7567                 if (!inode)
7568                         list_del_init(&binode->delalloc_inodes);
7569                 spin_unlock(&root->fs_info->delalloc_lock);
7570                 if (inode) {
7571                         work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7572                         if (!work) {
7573                                 ret = -ENOMEM;
7574                                 goto out;
7575                         }
7576                         list_add_tail(&work->list, &works);
7577                         btrfs_queue_worker(&root->fs_info->flush_workers,
7578                                            &work->work);
7579                 }
7580                 cond_resched();
7581                 spin_lock(&root->fs_info->delalloc_lock);
7582         }
7583         spin_unlock(&root->fs_info->delalloc_lock);
7584
7585         /* the filemap_flush will queue IO into the worker threads, but
7586          * we have to make sure the IO is actually started and that
7587          * ordered extents get created before we return
7588          */
7589         atomic_inc(&root->fs_info->async_submit_draining);
7590         while (atomic_read(&root->fs_info->nr_async_submits) ||
7591               atomic_read(&root->fs_info->async_delalloc_pages)) {
7592                 wait_event(root->fs_info->async_submit_wait,
7593                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
7594                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7595         }
7596         atomic_dec(&root->fs_info->async_submit_draining);
7597 out:
7598         list_for_each_entry_safe(work, next, &works, list) {
7599                 list_del_init(&work->list);
7600                 btrfs_wait_and_free_delalloc_work(work);
7601         }
7602         return ret;
7603 }
7604
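/*
 * Create a symlink.  The target is stored as an inline file extent, so it
 * must fit in a single leaf; longer targets get -ENAMETOOLONG.
 */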
7605 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7606                          const char *symname)
7607 {
7608         struct btrfs_trans_handle *trans;
7609         struct btrfs_root *root = BTRFS_I(dir)->root;
7610         struct btrfs_path *path;
7611         struct btrfs_key key;
7612         struct inode *inode = NULL;
7613         int err;
7614         int drop_inode = 0;
7615         u64 objectid;
7616         u64 index = 0;
7617         int name_len;
7618         int datasize;
7619         unsigned long ptr;
7620         struct btrfs_file_extent_item *ei;
7621         struct extent_buffer *leaf;
7622
7623         name_len = strlen(symname) + 1;
7624         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
7625                 return -ENAMETOOLONG;
7626
7627         /*
7628          * 2 items for inode item and ref
7629          * 2 items for dir items
7630          * 1 item for xattr if SELinux is on
7631          */
7632         trans = btrfs_start_transaction(root, 5);
7633         if (IS_ERR(trans))
7634                 return PTR_ERR(trans);
7635
7636         err = btrfs_find_free_ino(root, &objectid);
7637         if (err)
7638                 goto out_unlock;
7639
7640         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
7641                                 dentry->d_name.len, btrfs_ino(dir), objectid,
7642                                 S_IFLNK|S_IRWXUGO, &index);
7643         if (IS_ERR(inode)) {
7644                 err = PTR_ERR(inode);
7645                 goto out_unlock;
7646         }
7647
7648         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
7649         if (err) {
7650                 drop_inode = 1;
7651                 goto out_unlock;
7652         }
7653
7654         /*
7655          * If the active LSM wants to access the inode during
7656          * d_instantiate it needs these. Smack checks to see
7657          * if the filesystem supports xattrs by looking at the
7658          * ops vector.
7659          */
7660         inode->i_fop = &btrfs_file_operations;
7661         inode->i_op = &btrfs_file_inode_operations;
7662
7663         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7664         if (err)
7665                 drop_inode = 1;
7666         else {
7667                 inode->i_mapping->a_ops = &btrfs_aops;
7668                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7669                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7670         }
7671         if (drop_inode)
7672                 goto out_unlock;
7673
7674         path = btrfs_alloc_path();
7675         if (!path) {
7676                 err = -ENOMEM;
7677                 drop_inode = 1;
7678                 goto out_unlock;
7679         }
7680         key.objectid = btrfs_ino(inode);
7681         key.offset = 0;
7682         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
7683         datasize = btrfs_file_extent_calc_inline_size(name_len);
7684         err = btrfs_insert_empty_item(trans, root, path, &key,
7685                                       datasize);
7686         if (err) {
7687                 drop_inode = 1;
7688                 btrfs_free_path(path);
7689                 goto out_unlock;
7690         }
7691         leaf = path->nodes[0];
7692         ei = btrfs_item_ptr(leaf, path->slots[0],
7693                             struct btrfs_file_extent_item);
7694         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
7695         btrfs_set_file_extent_type(leaf, ei,
7696                                    BTRFS_FILE_EXTENT_INLINE);
7697         btrfs_set_file_extent_encryption(leaf, ei, 0);
7698         btrfs_set_file_extent_compression(leaf, ei, 0);
7699         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
7700         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
7701
7702         ptr = btrfs_file_extent_inline_start(ei);
7703         write_extent_buffer(leaf, symname, ptr, name_len);
7704         btrfs_mark_buffer_dirty(leaf);
7705         btrfs_free_path(path);
7706
7707         inode->i_op = &btrfs_symlink_inode_operations;
7708         inode->i_mapping->a_ops = &btrfs_symlink_aops;
7709         inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7710         inode_set_bytes(inode, name_len);
7711         btrfs_i_size_write(inode, name_len - 1);
7712         err = btrfs_update_inode(trans, root, inode);
7713         if (err)
7714                 drop_inode = 1;
7715
7716 out_unlock:
7717         if (!err)
7718                 d_instantiate(dentry, inode);
7719         btrfs_end_transaction(trans, root);
7720         if (drop_inode) {
7721                 inode_dec_link_count(inode);
7722                 iput(inode);
7723         }
7724         btrfs_btree_balance_dirty(root);
7725         return err;
7726 }
7727
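/*
 * Preallocate a file range as PREALLOC extents.  Works in chunks: reserve
 * an extent, insert the file extent item, seed the extent map cache and
 * update i_size/ctime, using the caller's transaction if one was passed in
 * or a fresh transaction per chunk otherwise.
 */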
7728 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7729                                        u64 start, u64 num_bytes, u64 min_size,
7730                                        loff_t actual_len, u64 *alloc_hint,
7731                                        struct btrfs_trans_handle *trans)
7732 {
7733         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
7734         struct extent_map *em;
7735         struct btrfs_root *root = BTRFS_I(inode)->root;
7736         struct btrfs_key ins;
7737         u64 cur_offset = start;
7738         u64 i_size;
7739         int ret = 0;
7740         bool own_trans = true;
7741
7742         if (trans)
7743                 own_trans = false;
7744         while (num_bytes > 0) {
7745                 if (own_trans) {
7746                         trans = btrfs_start_transaction(root, 3);
7747                         if (IS_ERR(trans)) {
7748                                 ret = PTR_ERR(trans);
7749                                 break;
7750                         }
7751                 }
7752
7753                 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
7754                                            0, *alloc_hint, &ins, 1);
7755                 if (ret) {
7756                         if (own_trans)
7757                                 btrfs_end_transaction(trans, root);
7758                         break;
7759                 }
7760
7761                 ret = insert_reserved_file_extent(trans, inode,
7762                                                   cur_offset, ins.objectid,
7763                                                   ins.offset, ins.offset,
7764                                                   ins.offset, 0, 0, 0,
7765                                                   BTRFS_FILE_EXTENT_PREALLOC);
7766                 if (ret) {
7767                         btrfs_abort_transaction(trans, root, ret);
7768                         if (own_trans)
7769                                 btrfs_end_transaction(trans, root);
7770                         break;
7771                 }
7772                 btrfs_drop_extent_cache(inode, cur_offset,
7773                                         cur_offset + ins.offset - 1, 0);
7774
7775                 em = alloc_extent_map();
7776                 if (!em) {
7777                         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
7778                                 &BTRFS_I(inode)->runtime_flags);
7779                         goto next;
7780                 }
7781
7782                 em->start = cur_offset;
7783                 em->orig_start = cur_offset;
7784                 em->len = ins.offset;
7785                 em->block_start = ins.objectid;
7786                 em->block_len = ins.offset;
7787                 em->orig_block_len = ins.offset;
7788                 em->bdev = root->fs_info->fs_devices->latest_bdev;
7789                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7790                 em->generation = trans->transid;
7791
7792                 while (1) {
7793                         write_lock(&em_tree->lock);
7794                         ret = add_extent_mapping(em_tree, em);
7795                         if (!ret)
7796                                 list_move(&em->list,
7797                                           &em_tree->modified_extents);
7798                         write_unlock(&em_tree->lock);
7799                         if (ret != -EEXIST)
7800                                 break;
7801                         btrfs_drop_extent_cache(inode, cur_offset,
7802                                                 cur_offset + ins.offset - 1,
7803                                                 0);
7804                 }
7805                 free_extent_map(em);
7806 next:
7807                 num_bytes -= ins.offset;
7808                 cur_offset += ins.offset;
7809                 *alloc_hint = ins.objectid + ins.offset;
7810
7811                 inode_inc_iversion(inode);
7812                 inode->i_ctime = CURRENT_TIME;
7813                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
7814                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
7815                     (actual_len > inode->i_size) &&
7816                     (cur_offset > inode->i_size)) {
7817                         if (cur_offset > actual_len)
7818                                 i_size = actual_len;
7819                         else
7820                                 i_size = cur_offset;
7821                         i_size_write(inode, i_size);
7822                         btrfs_ordered_update_i_size(inode, i_size, NULL);
7823                 }
7824
7825                 ret = btrfs_update_inode(trans, root, inode);
7826
7827                 if (ret) {
7828                         btrfs_abort_transaction(trans, root, ret);
7829                         if (own_trans)
7830                                 btrfs_end_transaction(trans, root);
7831                         break;
7832                 }
7833
7834                 if (own_trans)
7835                         btrfs_end_transaction(trans, root);
7836         }
7837         return ret;
7838 }
7839
7840 int btrfs_prealloc_file_range(struct inode *inode, int mode,
7841                               u64 start, u64 num_bytes, u64 min_size,
7842                               loff_t actual_len, u64 *alloc_hint)
7843 {
7844         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7845                                            min_size, actual_len, alloc_hint,
7846                                            NULL);
7847 }
7848
7849 int btrfs_prealloc_file_range_trans(struct inode *inode,
7850                                     struct btrfs_trans_handle *trans, int mode,
7851                                     u64 start, u64 num_bytes, u64 min_size,
7852                                     loff_t actual_len, u64 *alloc_hint)
7853 {
7854         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7855                                            min_size, actual_len, alloc_hint, trans);
7856 }
7857
7858 static int btrfs_set_page_dirty(struct page *page)
7859 {
7860         return __set_page_dirty_nobuffers(page);
7861 }
7862
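/*
 * Layer btrfs-specific write restrictions (read-only subvolume roots and
 * the per-inode READONLY flag) on top of generic_permission().
 */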
7863 static int btrfs_permission(struct inode *inode, int mask)
7864 {
7865         struct btrfs_root *root = BTRFS_I(inode)->root;
7866         umode_t mode = inode->i_mode;
7867
7868         if (mask & MAY_WRITE &&
7869             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
7870                 if (btrfs_root_readonly(root))
7871                         return -EROFS;
7872                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
7873                         return -EACCES;
7874         }
7875         return generic_permission(inode, mask);
7876 }
7877
7878 static const struct inode_operations btrfs_dir_inode_operations = {
7879         .getattr        = btrfs_getattr,
7880         .lookup         = btrfs_lookup,
7881         .create         = btrfs_create,
7882         .unlink         = btrfs_unlink,
7883         .link           = btrfs_link,
7884         .mkdir          = btrfs_mkdir,
7885         .rmdir          = btrfs_rmdir,
7886         .rename         = btrfs_rename,
7887         .symlink        = btrfs_symlink,
7888         .setattr        = btrfs_setattr,
7889         .mknod          = btrfs_mknod,
7890         .setxattr       = btrfs_setxattr,
7891         .getxattr       = btrfs_getxattr,
7892         .listxattr      = btrfs_listxattr,
7893         .removexattr    = btrfs_removexattr,
7894         .permission     = btrfs_permission,
7895         .get_acl        = btrfs_get_acl,
7896 };
7897 static const struct inode_operations btrfs_dir_ro_inode_operations = {
7898         .lookup         = btrfs_lookup,
7899         .permission     = btrfs_permission,
7900         .get_acl        = btrfs_get_acl,
7901 };
7902
7903 static const struct file_operations btrfs_dir_file_operations = {
7904         .llseek         = generic_file_llseek,
7905         .read           = generic_read_dir,
7906         .readdir        = btrfs_real_readdir,
7907         .unlocked_ioctl = btrfs_ioctl,
7908 #ifdef CONFIG_COMPAT
7909         .compat_ioctl   = btrfs_ioctl,
7910 #endif
7911         .release        = btrfs_release_file,
7912         .fsync          = btrfs_sync_file,
7913 };
7914
7915 static struct extent_io_ops btrfs_extent_io_ops = {
7916         .fill_delalloc = run_delalloc_range,
7917         .submit_bio_hook = btrfs_submit_bio_hook,
7918         .merge_bio_hook = btrfs_merge_bio_hook,
7919         .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7920         .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7921         .writepage_start_hook = btrfs_writepage_start_hook,
7922         .set_bit_hook = btrfs_set_bit_hook,
7923         .clear_bit_hook = btrfs_clear_bit_hook,
7924         .merge_extent_hook = btrfs_merge_extent_hook,
7925         .split_extent_hook = btrfs_split_extent_hook,
7926 };
7927
7928 /*
7929  * btrfs doesn't support the bmap operation because swapfiles
7930  * use bmap to make a mapping of extents in the file.  They assume
7931  * these extents won't change over the life of the file and they
7932  * use the bmap result to do IO directly to the drive.
7933  *
7934  * the btrfs bmap call would return logical addresses that aren't
7935  * suitable for IO, and they will also change frequently as COW
7936  * operations happen.  So, swapfile + btrfs == corruption.
7937  *
7938  * For now we're avoiding this by dropping bmap.
7939  */
7940 static const struct address_space_operations btrfs_aops = {
7941         .readpage       = btrfs_readpage,
7942         .writepage      = btrfs_writepage,
7943         .writepages     = btrfs_writepages,
7944         .readpages      = btrfs_readpages,
7945         .direct_IO      = btrfs_direct_IO,
7946         .invalidatepage = btrfs_invalidatepage,
7947         .releasepage    = btrfs_releasepage,
7948         .set_page_dirty = btrfs_set_page_dirty,
7949         .error_remove_page = generic_error_remove_page,
7950 };
7951
7952 static const struct address_space_operations btrfs_symlink_aops = {
7953         .readpage       = btrfs_readpage,
7954         .writepage      = btrfs_writepage,
7955         .invalidatepage = btrfs_invalidatepage,
7956         .releasepage    = btrfs_releasepage,
7957 };
7958
7959 static const struct inode_operations btrfs_file_inode_operations = {
7960         .getattr        = btrfs_getattr,
7961         .setattr        = btrfs_setattr,
7962         .setxattr       = btrfs_setxattr,
7963         .getxattr       = btrfs_getxattr,
7964         .listxattr      = btrfs_listxattr,
7965         .removexattr    = btrfs_removexattr,
7966         .permission     = btrfs_permission,
7967         .fiemap         = btrfs_fiemap,
7968         .get_acl        = btrfs_get_acl,
7969         .update_time    = btrfs_update_time,
7970 };
7971 static const struct inode_operations btrfs_special_inode_operations = {
7972         .getattr        = btrfs_getattr,
7973         .setattr        = btrfs_setattr,
7974         .permission     = btrfs_permission,
7975         .setxattr       = btrfs_setxattr,
7976         .getxattr       = btrfs_getxattr,
7977         .listxattr      = btrfs_listxattr,
7978         .removexattr    = btrfs_removexattr,
7979         .get_acl        = btrfs_get_acl,
7980         .update_time    = btrfs_update_time,
7981 };
7982 static const struct inode_operations btrfs_symlink_inode_operations = {
7983         .readlink       = generic_readlink,
7984         .follow_link    = page_follow_link_light,
7985         .put_link       = page_put_link,
7986         .getattr        = btrfs_getattr,
7987         .setattr        = btrfs_setattr,
7988         .permission     = btrfs_permission,
7989         .setxattr       = btrfs_setxattr,
7990         .getxattr       = btrfs_getxattr,
7991         .listxattr      = btrfs_listxattr,
7992         .removexattr    = btrfs_removexattr,
7993         .get_acl        = btrfs_get_acl,
7994         .update_time    = btrfs_update_time,
7995 };
7996
7997 const struct dentry_operations btrfs_dentry_operations = {
7998         .d_delete       = btrfs_dentry_delete,
7999         .d_release      = btrfs_dentry_release,
8000 };