Btrfs: Add fallocate support v2
fs/btrfs/file.c
index 1b7e51a..238a8e2 100644
@@ -41,6 +41,9 @@
 #include "compat.h"
 
 
+/* simple helper to fault in pages and copy.  This should go away
+ * and be replaced with calls into generic code.
+ */
 static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
                                         int write_bytes,
                                         struct page **prepared_pages,
@@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
        return page_fault ? -EFAULT : 0;
 }
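The body of btrfs_copy_from_user() is not shown in this hunk; roughly, it carves the user buffer into page-sized copies, where only the first page is copied at a non-zero in-page offset. A small userspace sketch of that carving, assuming a 4096-byte PAGE_CACHE_SIZE (illustrative only, not the kernel loop itself):

	#include <stdio.h>

	#define PAGE_SIZE_ASSUMED 4096UL	/* stand-in for PAGE_CACHE_SIZE */

	int main(void)
	{
		unsigned long pos = 5000, write_bytes = 9000;
		unsigned long offset = pos & (PAGE_SIZE_ASSUMED - 1); /* offset in first page */
		int i = 0;

		while (write_bytes > 0) {
			unsigned long count = PAGE_SIZE_ASSUMED - offset;

			if (count > write_bytes)
				count = write_bytes;
			printf("page %d: copy %lu bytes at page offset %lu\n",
			       i, count, offset);
			write_bytes -= count;
			offset = 0;	/* every page after the first starts at 0 */
			i++;
		}
		return 0;
	}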
 
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
 static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
        size_t i;
        for (i = 0; i < num_pages; i++) {
                if (!pages[i])
                        break;
+               /* PageChecked is some magic around finding pages that
+                * have been modified without going through
+                * btrfs_set_page_dirty; clear it here.
+                */
                ClearPageChecked(pages[i]);
                unlock_page(pages[i]);
                mark_page_accessed(pages[i]);
@@ -85,149 +95,14 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
        }
 }
 
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root, struct inode *inode,
-                               u64 offset, size_t size,
-                               struct page **pages, size_t page_offset,
-                               int num_pages)
-{
-       struct btrfs_key key;
-       struct btrfs_path *path;
-       struct extent_buffer *leaf;
-       char *kaddr;
-       unsigned long ptr;
-       struct btrfs_file_extent_item *ei;
-       struct page *page;
-       u32 datasize;
-       int err = 0;
-       int ret;
-       int i;
-       ssize_t cur_size;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       btrfs_set_trans_block_group(trans, inode);
-
-       key.objectid = inode->i_ino;
-       key.offset = offset;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-       if (ret < 0) {
-               err = ret;
-               goto fail;
-       }
-       if (ret == 1) {
-               struct btrfs_key found_key;
-
-               if (path->slots[0] == 0)
-                       goto insert;
-
-               path->slots[0]--;
-               leaf = path->nodes[0];
-               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-               if (found_key.objectid != inode->i_ino)
-                       goto insert;
-
-               if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-                       goto insert;
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-
-               if (btrfs_file_extent_type(leaf, ei) !=
-                   BTRFS_FILE_EXTENT_INLINE) {
-                       goto insert;
-               }
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               ret = 0;
-       }
-       if (ret == 0) {
-               u32 found_size;
-               u64 found_end;
-
-               leaf = path->nodes[0];
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-
-               if (btrfs_file_extent_type(leaf, ei) !=
-                   BTRFS_FILE_EXTENT_INLINE) {
-                       err = ret;
-                       btrfs_print_leaf(root, leaf);
-                       printk("found wasn't inline offset %Lu inode %lu\n",
-                              offset, inode->i_ino);
-                       goto fail;
-               }
-               found_size = btrfs_file_extent_inline_len(leaf,
-                                         btrfs_item_nr(leaf, path->slots[0]));
-               found_end = key.offset + found_size;
-
-               if (found_end < offset + size) {
-                       btrfs_release_path(root, path);
-                       ret = btrfs_search_slot(trans, root, &key, path,
-                                               offset + size - found_end, 1);
-                       BUG_ON(ret != 0);
-
-                       ret = btrfs_extend_item(trans, root, path,
-                                               offset + size - found_end);
-                       if (ret) {
-                               err = ret;
-                               goto fail;
-                       }
-                       leaf = path->nodes[0];
-                       ei = btrfs_item_ptr(leaf, path->slots[0],
-                                           struct btrfs_file_extent_item);
-                       inode->i_blocks += (offset + size - found_end) >> 9;
-               }
-               if (found_end < offset) {
-                       ptr = btrfs_file_extent_inline_start(ei) + found_size;
-                       memset_extent_buffer(leaf, 0, ptr, offset - found_end);
-               }
-       } else {
-insert:
-               btrfs_release_path(root, path);
-               datasize = offset + size - key.offset;
-               inode->i_blocks += datasize >> 9;
-               datasize = btrfs_file_extent_calc_inline_size(datasize);
-               ret = btrfs_insert_empty_item(trans, root, path, &key,
-                                             datasize);
-               if (ret) {
-                       err = ret;
-                       printk("got bad ret %d\n", ret);
-                       goto fail;
-               }
-               leaf = path->nodes[0];
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-               btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-               btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-       }
-       ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
-       cur_size = size;
-       i = 0;
-       while (size > 0) {
-               page = pages[i];
-               kaddr = kmap_atomic(page, KM_USER0);
-               cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
-               write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
-               kunmap_atomic(kaddr, KM_USER0);
-               page_offset = 0;
-               ptr += cur_size;
-               size -= cur_size;
-               if (i >= num_pages) {
-                       printk("i %d num_pages %d\n", i, num_pages);
-               }
-               i++;
-       }
-       btrfs_mark_buffer_dirty(leaf);
-fail:
-       btrfs_free_path(path);
-       return err;
-}
-
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
 static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct file *file,
@@ -245,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
        u64 start_pos;
        u64 end_of_last_block;
        u64 end_pos = pos + write_bytes;
-       u64 inline_size;
-       int did_inline = 0;
        loff_t isize = i_size_read(inode);
 
        start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -269,99 +142,31 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
        }
        set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
-       /* FIXME...EIEIO, ENOSPC and more */
-       /* insert any holes we need to create */
-       if (isize < start_pos) {
-               u64 last_pos_in_file;
-               u64 hole_size;
-               u64 mask = root->sectorsize - 1;
-               last_pos_in_file = (isize + mask) & ~mask;
-               hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-               if (hole_size > 0) {
-                       btrfs_wait_ordered_range(inode, last_pos_in_file,
-                                                last_pos_in_file + hole_size);
-                       mutex_lock(&BTRFS_I(inode)->extent_mutex);
-                       err = btrfs_drop_extents(trans, root, inode,
-                                                last_pos_in_file,
-                                                last_pos_in_file + hole_size,
-                                                last_pos_in_file,
-                                                &hint_byte);
-                       if (err)
-                               goto failed;
-
-                       err = btrfs_insert_file_extent(trans, root,
-                                                      inode->i_ino,
-                                                      last_pos_in_file,
-                                                      0, 0, hole_size, 0);
-                       btrfs_drop_extent_cache(inode, last_pos_in_file,
-                                       last_pos_in_file + hole_size - 1, 0);
-                       mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-                       btrfs_check_file(root, inode);
-               }
-               if (err)
-                       goto failed;
-       }
-
-       /*
-        * either allocate an extent for the new bytes or setup the key
-        * to show we are doing inline data in the extent
+       /* check for reserved extents on each page; we don't want
+        * to reset the delalloc bit on things that already have
+        * extents reserved.
         */
-       inline_size = end_pos;
-       if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-           inline_size > root->fs_info->max_inline ||
-           (inline_size & (root->sectorsize -1)) == 0 ||
-           inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-               /* check for reserved extents on each page, we don't want
-                * to reset the delalloc bit on things that already have
-                * extents reserved.
-                */
-               btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-               for (i = 0; i < num_pages; i++) {
-                       struct page *p = pages[i];
-                       SetPageUptodate(p);
-                       ClearPageChecked(p);
-                       set_page_dirty(p);
-               }
-       } else {
-               u64 aligned_end;
-               /* step one, delete the existing extents in this range */
-               aligned_end = (pos + write_bytes + root->sectorsize - 1) &
-                       ~((u64)root->sectorsize - 1);
-               mutex_lock(&BTRFS_I(inode)->extent_mutex);
-               err = btrfs_drop_extents(trans, root, inode, start_pos,
-                                        aligned_end, aligned_end, &hint_byte);
-               if (err)
-                       goto failed;
-               if (isize > inline_size)
-                       inline_size = min_t(u64, isize, aligned_end);
-               inline_size -= start_pos;
-               err = insert_inline_extent(trans, root, inode, start_pos,
-                                          inline_size, pages, 0, num_pages);
-               btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
-               BUG_ON(err);
-               mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
-               /*
-                * an ugly way to do all the prop accounting around
-                * the page bits and mapping tags
-                */
-               set_page_writeback(pages[0]);
-               end_page_writeback(pages[0]);
-               did_inline = 1;
+       btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = pages[i];
+               SetPageUptodate(p);
+               ClearPageChecked(p);
+               set_page_dirty(p);
        }
        if (end_pos > isize) {
                i_size_write(inode, end_pos);
-               if (did_inline)
-                       BTRFS_I(inode)->disk_i_size = end_pos;
                btrfs_update_inode(trans, root, inode);
        }
-failed:
        err = btrfs_end_transaction(trans, root);
 out_unlock:
        unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
        return err;
 }
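A side note on the alignment math at the top of dirty_and_release_pages() (start_pos and end_of_last_block): a minimal userspace sketch, assuming a 4096-byte sectorsize (the kernel takes it from root->sectorsize) and assuming end_of_last_block is the last byte of the last sector the write touches, since its exact computation falls outside this hunk:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t sectorsize = 4096;	/* assumed; the kernel uses root->sectorsize */
		uint64_t pos = 5000, write_bytes = 3000;

		/* round the start of the write down to a sector boundary */
		uint64_t start_pos = pos & ~(sectorsize - 1);
		/* round the byte count up so whole sectors are covered */
		uint64_t num_bytes = (write_bytes + pos - start_pos + sectorsize - 1) &
				     ~(sectorsize - 1);
		uint64_t end_of_last_block = start_pos + num_bytes - 1;

		printf("start_pos %llu end_of_last_block %llu\n",
		       (unsigned long long)start_pos,
		       (unsigned long long)end_of_last_block);
		return 0;
	}

With pos 5000 and write_bytes 3000 this prints start_pos 4096 and end_of_last_block 8191, i.e. the single 4K sector the write falls into.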
 
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end].  Existing extents are split as required.
+ */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned)
 {
@@ -373,6 +178,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
        int ret;
        int testend = 1;
        unsigned long flags;
+       int compressed = 0;
 
        WARN_ON(end < start);
        if (end == (u64)-1) {
@@ -408,6 +214,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        free_extent_map(em);
                        continue;
                }
+               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);
 
@@ -416,6 +223,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->start = em->start;
                        split->len = start - em->start;
                        split->block_start = em->block_start;
+
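+                       /*
+                        * compressed data on disk is a single opaque blob
+                        * that cannot be subdivided, so a partial mapping
+                        * must keep referencing the whole on-disk extent
+                        */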
+                       if (compressed)
+                               split->block_len = em->block_len;
+                       else
+                               split->block_len = split->len;
+
                        split->bdev = em->bdev;
                        split->flags = flags;
                        ret = add_extent_mapping(em_tree, split);
@@ -433,7 +246,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->bdev = em->bdev;
                        split->flags = flags;
 
-                       split->block_start = em->block_start + diff;
+                       if (compressed) {
+                               split->block_len = em->block_len;
+                               split->block_start = em->block_start;
+                       } else {
+                               split->block_len = split->len;
+                               split->block_start = em->block_start + diff;
+                       }
 
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
@@ -507,7 +326,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
                        struct btrfs_item *item;
                        item = btrfs_item_nr(leaf, slot);
                        extent_end = found_key.offset +
-                            btrfs_file_extent_inline_len(leaf, item);
+                            btrfs_file_extent_inline_len(leaf, extent);
                        extent_end = (extent_end + root->sectorsize - 1) &
                                ~((u64)root->sectorsize -1 );
                }
@@ -536,14 +355,22 @@ out:
  * If an extent intersects the range but is not entirely inside the range
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
+ *
+ * inline_limit is used to tell this code which offsets in the file to keep
+ * if they contain inline extents.
  */
 int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
        u64 extent_end = 0;
+       u64 locked_end = end;
        u64 search_start = start;
        u64 leaf_start;
+       u64 ram_bytes = 0;
+       u8 compression = 0;
+       u8 encryption = 0;
+       u16 other_encoding = 0;
        u64 root_gen;
        u64 root_owner;
        struct extent_buffer *leaf;
@@ -554,12 +381,13 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
        int keep;
        int slot;
        int bookend;
-       int found_type;
+       int found_type = 0;
        int found_extent;
        int found_inline;
        int recow;
        int ret;
 
+       inline_limit = 0;
        btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
        path = btrfs_alloc_path();
@@ -608,7 +436,14 @@ next_slot:
                        extent = btrfs_item_ptr(leaf, slot,
                                                struct btrfs_file_extent_item);
                        found_type = btrfs_file_extent_type(leaf, extent);
-                       if (found_type == BTRFS_FILE_EXTENT_REG) {
+                       compression = btrfs_file_extent_compression(leaf,
+                                                                   extent);
+                       encryption = btrfs_file_extent_encryption(leaf,
+                                                                 extent);
+                       other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                 extent);
+                       if (found_type == BTRFS_FILE_EXTENT_REG ||
+                           found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                                extent_end =
                                     btrfs_file_extent_disk_bytenr(leaf,
                                                                   extent);
@@ -617,13 +452,13 @@ next_slot:
 
                                extent_end = key.offset +
                                     btrfs_file_extent_num_bytes(leaf, extent);
+                               ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+                                                               extent);
                                found_extent = 1;
                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-                               struct btrfs_item *item;
-                               item = btrfs_item_nr(leaf, slot);
                                found_inline = 1;
                                extent_end = key.offset +
-                                    btrfs_file_extent_inline_len(leaf, item);
+                                    btrfs_file_extent_inline_len(leaf, extent);
                        }
                } else {
                        extent_end = search_start;
@@ -646,15 +481,8 @@ next_slot:
                        goto next_slot;
                }
 
-               if (found_inline) {
-                       u64 mask = root->sectorsize - 1;
-                       search_start = (extent_end + mask) & ~mask;
-               } else
-                       search_start = extent_end;
-               if (end <= extent_end && start >= key.offset && found_inline) {
+               if (end <= extent_end && start >= key.offset && found_inline)
                        *hint_byte = EXTENT_MAP_INLINE;
-                       goto out;
-               }
 
                if (found_extent) {
                        read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -669,6 +497,26 @@ next_slot:
                        if (found_inline && start <= key.offset)
                                keep = 1;
                }
+
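+               /*
+                * a piece of this extent will survive past 'end' (a
+                * bookend), so the io_tree lock has to cover out to
+                * extent_end before we touch it.  Try-lock first because
+                * we hold a btree path here; if that fails, drop the path,
+                * take the blocking lock and redo the search.
+                */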
+               if (bookend && found_extent && locked_end < extent_end) {
+                       ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+                                       locked_end, extent_end - 1, GFP_NOFS);
+                       if (!ret) {
+                               btrfs_release_path(root, path);
+                               lock_extent(&BTRFS_I(inode)->io_tree,
+                                       locked_end, extent_end - 1, GFP_NOFS);
+                               locked_end = extent_end;
+                               continue;
+                       }
+                       locked_end = extent_end;
+               }
+
+               if (found_inline) {
+                       u64 mask = root->sectorsize - 1;
+                       search_start = (extent_end + mask) & ~mask;
+               } else
+                       search_start = extent_end;
+
                /* truncate existing extent */
                if (start > key.offset) {
                        u64 new_num;
@@ -684,7 +532,8 @@ next_slot:
                                                                      extent);
                                if (btrfs_file_extent_disk_bytenr(leaf,
                                                                  extent)) {
-                                       dec_i_blocks(inode, old_num - new_num);
+                                       inode_sub_bytes(inode, old_num -
+                                                       new_num);
                                }
                                btrfs_set_file_extent_num_bytes(leaf, extent,
                                                                new_num);
@@ -695,14 +544,17 @@ next_slot:
                                u32 new_size;
                                new_size = btrfs_file_extent_calc_inline_size(
                                                   inline_limit - key.offset);
-                               dec_i_blocks(inode, (extent_end - key.offset) -
-                                       (inline_limit - key.offset));
+                               inode_sub_bytes(inode, extent_end -
+                                               inline_limit);
                                btrfs_truncate_item(trans, root, path,
                                                    new_size, 1);
                        }
                }
                /* delete the entire extent */
                if (!keep) {
+                       if (found_inline)
+                               inode_sub_bytes(inode, extent_end -
+                                               key.offset);
                        ret = btrfs_del_item(trans, root, path);
                        /* TODO update progress marker and return */
                        BUG_ON(ret);
@@ -714,8 +566,7 @@ next_slot:
                        u32 new_size;
                        new_size = btrfs_file_extent_calc_inline_size(
                                                   extent_end - end);
-                       dec_i_blocks(inode, (extent_end - key.offset) -
-                                       (extent_end - end));
+                       inode_sub_bytes(inode, end - key.offset);
                        ret = btrfs_truncate_item(trans, root, path,
                                                  new_size, 0);
                        BUG_ON(ret);
@@ -738,14 +589,28 @@ next_slot:
                        write_extent_buffer(leaf, &old,
                                            (unsigned long)extent, sizeof(old));
 
+                       btrfs_set_file_extent_compression(leaf, extent,
+                                                         compression);
+                       btrfs_set_file_extent_encryption(leaf, extent,
+                                                        encryption);
+                       btrfs_set_file_extent_other_encoding(leaf, extent,
+                                                            other_encoding);
                        btrfs_set_file_extent_offset(leaf, extent,
                                    le64_to_cpu(old.offset) + end - key.offset);
                        WARN_ON(le64_to_cpu(old.num_bytes) <
                                (extent_end - end));
                        btrfs_set_file_extent_num_bytes(leaf, extent,
                                                        extent_end - end);
-                       btrfs_set_file_extent_type(leaf, extent,
-                                                  BTRFS_FILE_EXTENT_REG);
+
+                       /*
+                        * set the ram bytes to the size of the full extent
+                        * before splitting.  This is a worst case flag,
+                        * but it's the best we can do because we don't know
+                        * how splitting affects compression.
+                        */
+                       btrfs_set_file_extent_ram_bytes(leaf, extent,
+                                                       ram_bytes);
+                       btrfs_set_file_extent_type(leaf, extent, found_type);
 
                        btrfs_mark_buffer_dirty(path->nodes[0]);
 
@@ -756,15 +621,12 @@ next_slot:
                                                le64_to_cpu(old.disk_num_bytes),
                                                leaf->start,
                                                root->root_key.objectid,
-                                               trans->transid,
-                                               ins.objectid, ins.offset);
+                                               trans->transid, ins.objectid);
                                BUG_ON(ret);
                        }
                        btrfs_release_path(root, path);
                        if (disk_bytenr != 0) {
-                               inode->i_blocks +=
-                                     btrfs_file_extent_num_bytes(leaf,
-                                                                 extent) >> 9;
+                               inode_add_bytes(inode, extent_end - end);
                        }
                }
 
@@ -772,13 +634,13 @@ next_slot:
                        u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
 
                        if (disk_bytenr != 0) {
-                               dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
+                               inode_sub_bytes(inode,
+                                               le64_to_cpu(old.num_bytes));
                                ret = btrfs_free_extent(trans, root,
                                                disk_bytenr,
                                                le64_to_cpu(old.disk_num_bytes),
                                                leaf_start, root_owner,
-                                               root_gen, key.objectid,
-                                               key.offset, 0);
+                                               root_gen, key.objectid, 0);
                                BUG_ON(ret);
                                *hint_byte = disk_bytenr;
                        }
@@ -791,12 +653,255 @@ next_slot:
        }
 out:
        btrfs_free_path(path);
+       if (locked_end > end) {
+               unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+                             GFP_NOFS);
+       }
        btrfs_check_file(root, inode);
        return ret;
 }
 
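+/*
+ * helper for btrfs_mark_extent_written: returns 1 if the extent item in
+ * 'slot' is a plain (uncompressed, unencrypted) regular extent that points
+ * at 'bytenr' and whose file range is consistent with *start/*end, filling
+ * in the extent's range on success.
+ */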
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+                           u64 objectid, u64 bytenr, u64 *start, u64 *end)
+{
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 extent_end;
+
+       if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+               return 0;
+
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+               return 0;
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+           btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+           btrfs_file_extent_compression(leaf, fi) ||
+           btrfs_file_extent_encryption(leaf, fi) ||
+           btrfs_file_extent_other_encoding(leaf, fi))
+               return 0;
+
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       if ((*start && *start != key.offset) || (*end && *end != extent_end))
+               return 0;
+
+       *start = key.offset;
+       *end = extent_end;
+       return 1;
+}
+
+/*
+ * Mark the extent in the range [start, end) as written.
+ *
+ * This changes the extent type from 'pre-allocated' to 'regular'.  If only
+ * part of the extent is marked as written, the extent will be split into
+ * two or three pieces.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct inode *inode, u64 start, u64 end)
+{
+       struct extent_buffer *leaf;
+       struct btrfs_path *path;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 bytenr;
+       u64 num_bytes;
+       u64 extent_end;
+       u64 extent_offset;
+       u64 other_start;
+       u64 other_end;
+       u64 split = start;
+       u64 locked_end = end;
+       int extent_type;
+       int split_end = 1;
+       int ret;
+
+       btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+again:
+       key.objectid = inode->i_ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       if (split == start)
+               key.offset = split;
+       else
+               key.offset = split - 1;
+
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret > 0 && path->slots[0] > 0)
+               path->slots[0]--;
+
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+       BUG_ON(key.objectid != inode->i_ino ||
+              key.type != BTRFS_EXTENT_DATA_KEY);
+       fi = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+       extent_type = btrfs_file_extent_type(leaf, fi);
+       BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       BUG_ON(key.offset > start || extent_end < end);
+
+       bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+       extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+       if (key.offset == start)
+               split = end;
+
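+       /*
+        * the written range covers the whole preallocated extent.  See if
+        * the neighbouring items point at the same disk extent and can be
+        * merged in; otherwise just switch the type to regular in place.
+        */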
+       if (key.offset == start && extent_end == end) {
+               int del_nr = 0;
+               int del_slot = 0;
+               u64 leaf_owner = btrfs_header_owner(leaf);
+               u64 leaf_gen = btrfs_header_generation(leaf);
+               other_start = end;
+               other_end = 0;
+               if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+                                    bytenr, &other_start, &other_end)) {
+                       extent_end = other_end;
+                       del_slot = path->slots[0] + 1;
+                       del_nr++;
+                       ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+                                               leaf->start, leaf_owner,
+                                               leaf_gen, inode->i_ino, 0);
+                       BUG_ON(ret);
+               }
+               other_start = 0;
+               other_end = start;
+               if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+                                    bytenr, &other_start, &other_end)) {
+                       key.offset = other_start;
+                       del_slot = path->slots[0];
+                       del_nr++;
+                       ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+                                               leaf->start, leaf_owner,
+                                               leaf_gen, inode->i_ino, 0);
+                       BUG_ON(ret);
+               }
+               split_end = 0;
+               if (del_nr == 0) {
+                       btrfs_set_file_extent_type(leaf, fi,
+                                                  BTRFS_FILE_EXTENT_REG);
+                       goto done;
+               }
+
+               fi = btrfs_item_ptr(leaf, del_slot - 1,
+                                   struct btrfs_file_extent_item);
+               btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+               btrfs_set_file_extent_num_bytes(leaf, fi,
+                                               extent_end - key.offset);
+               btrfs_mark_buffer_dirty(leaf);
+
+               ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+               BUG_ON(ret);
+               goto done;
+       } else if (split == start) {
+               if (locked_end < extent_end) {
+                       ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+                                       locked_end, extent_end - 1, GFP_NOFS);
+                       if (!ret) {
+                               btrfs_release_path(root, path);
+                               lock_extent(&BTRFS_I(inode)->io_tree,
+                                       locked_end, extent_end - 1, GFP_NOFS);
+                               locked_end = extent_end;
+                               goto again;
+                       }
+                       locked_end = extent_end;
+               }
+               btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
+               extent_offset += split - key.offset;
+       } else {
+               BUG_ON(key.offset != start);
+               btrfs_set_file_extent_offset(leaf, fi, extent_offset +
+                                            split - key.offset);
+               btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
+               key.offset = split;
+               btrfs_set_item_key_safe(trans, root, path, &key);
+               extent_end = split;
+       }
+
+       if (extent_end == end) {
+               split_end = 0;
+               extent_type = BTRFS_FILE_EXTENT_REG;
+       }
+       if (extent_end == end && split == start) {
+               other_start = end;
+               other_end = 0;
+               if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+                                    bytenr, &other_start, &other_end)) {
+                       path->slots[0]++;
+                       fi = btrfs_item_ptr(leaf, path->slots[0],
+                                           struct btrfs_file_extent_item);
+                       key.offset = split;
+                       btrfs_set_item_key_safe(trans, root, path, &key);
+                       btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+                       btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                       other_end - split);
+                       goto done;
+               }
+       }
+       if (extent_end == end && split == end) {
+               other_start = 0;
+               other_end = start;
+               if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+                                    bytenr, &other_start, &other_end)) {
+                       path->slots[0]--;
+                       fi = btrfs_item_ptr(leaf, path->slots[0],
+                                           struct btrfs_file_extent_item);
+                       btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
+                                                       other_start);
+                       goto done;
+               }
+       }
+
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_release_path(root, path);
+
+       key.offset = start;
+       ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
+       BUG_ON(ret);
+
+       leaf = path->nodes[0];
+       fi = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+       btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+       btrfs_set_file_extent_type(leaf, fi, extent_type);
+       btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
+       btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
+       btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+       btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
+       btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+       btrfs_set_file_extent_compression(leaf, fi, 0);
+       btrfs_set_file_extent_encryption(leaf, fi, 0);
+       btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+
+       ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+                                  leaf->start, root->root_key.objectid,
+                                  trans->transid, inode->i_ino);
+       BUG_ON(ret);
+done:
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_release_path(root, path);
+       if (split_end && split == start) {
+               split = end;
+               goto again;
+       }
+       if (locked_end > end) {
+               unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+                             GFP_NOFS);
+       }
+       btrfs_free_path(path);
+       return 0;
+}
+
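To make the two-or-three-way split described above concrete, here is a rough standalone model (illustrative only: it computes just the resulting file ranges, not the btree items, extent_offset adjustments or back references that btrfs_mark_extent_written() actually maintains):

	#include <stdio.h>
	#include <stdint.h>

	/* toy model: print the pieces a preallocated extent breaks into once
	 * [start, end) has been written.  Offsets are file offsets in bytes. */
	static void show_split(uint64_t key_offset, uint64_t extent_end,
			       uint64_t start, uint64_t end)
	{
		if (key_offset < start)
			printf("prealloc [%llu, %llu)\n",
			       (unsigned long long)key_offset, (unsigned long long)start);
		printf("regular  [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)end);
		if (end < extent_end)
			printf("prealloc [%llu, %llu)\n",
			       (unsigned long long)end, (unsigned long long)extent_end);
	}

	int main(void)
	{
		/* a 16K fallocate()d extent with 4K written in the middle: three pieces */
		show_split(0, 16384, 4096, 8192);
		/* a write that starts at the front of the extent: two pieces */
		show_split(0, 16384, 0, 8192);
		return 0;
	}

In the function itself, the pieces that stay preallocated keep pointing into the same disk extent (bytenr and num_bytes are reused; only the file offsets and extent_offset change), and extent_mergeable() lets a fully written piece that borders a regular extent on the same disk extent be merged into it instead of inserted as a new item.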
 /*
- * this gets pages into the page cache and locks them down
+ * this gets pages into the page cache and locks them down.  It also
+ * properly waits for data=ordered extents to finish before allowing the
+ * pages to be modified.
  */
 static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
                         struct page **pages, size_t num_pages,
@@ -813,6 +918,12 @@ static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
        start_pos = pos & ~((u64)root->sectorsize - 1);
        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
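+       /*
+        * if the write starts beyond the current EOF, expand the file first
+        * so the range between the old EOF and start_pos becomes a proper
+        * hole
+        */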
+       if (start_pos > inode->i_size) {
+               err = btrfs_cont_expand(inode, start_pos);
+               if (err)
+                       return err;
+       }
+
        memset(pages, 0, num_pages * sizeof(struct page *));
 again:
        for (i = 0; i < num_pages; i++) {
@@ -874,6 +985,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
        struct page *pinned[2];
        unsigned long first_index;
        unsigned long last_index;
+       int will_write;
+
+       will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+                     (file->f_flags & O_DIRECT));
 
        nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
                     PAGE_CACHE_SIZE / (sizeof(struct page *)));
@@ -970,15 +1085,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (ret)
                        goto out;
 
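+               /*
+                * synchronous writers (O_SYNC, O_DIRECT) kick off writeback
+                * on this chunk right away; everyone else is just throttled
+                * against the dirty page limits
+                */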
+               if (will_write) {
+                       btrfs_fdatawrite_range(inode->i_mapping, pos,
+                                              pos + write_bytes - 1,
+                                              WB_SYNC_NONE);
+               } else {
+                       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                          num_pages);
+                       if (num_pages <
+                           (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                               btrfs_btree_balance_dirty(root, 1);
+                       btrfs_throttle(root);
+               }
+
                buf += write_bytes;
                count -= write_bytes;
                pos += write_bytes;
                num_written += write_bytes;
 
-               balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
-               if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-                       btrfs_btree_balance_dirty(root, 1);
-               btrfs_throttle(root);
                cond_resched();
        }
 out:
@@ -992,36 +1116,29 @@ out_nolock:
                page_cache_release(pinned[1]);
        *ppos = pos;
 
-       if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+       if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;
 
-               err = btrfs_fdatawrite_range(inode->i_mapping, start_pos,
-                                            start_pos + num_written -1,
-                                            WB_SYNC_NONE);
-               if (err < 0)
-                       num_written = err;
-
-               err = btrfs_wait_on_page_writeback_range(inode->i_mapping,
-                                start_pos, start_pos + num_written - 1);
-               if (err < 0)
+               err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+               if (err)
                        num_written = err;
 
-               trans = btrfs_start_transaction(root, 1);
-               ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-               if (ret == 0) {
-                       btrfs_sync_log(trans, root);
-                       btrfs_end_transaction(trans, root);
-               } else {
-                       btrfs_commit_transaction(trans, root);
+               if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+                       trans = btrfs_start_transaction(root, 1);
+                       ret = btrfs_log_dentry_safe(trans, root,
+                                                   file->f_dentry);
+                       if (ret == 0) {
+                               btrfs_sync_log(trans, root);
+                               btrfs_end_transaction(trans, root);
+                       } else {
+                               btrfs_commit_transaction(trans, root);
+                       }
+               }
+               if (file->f_flags & O_DIRECT) {
+                       invalidate_mapping_pages(inode->i_mapping,
+                             start_pos >> PAGE_CACHE_SHIFT,
+                            (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
                }
-       } else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
-               do_sync_mapping_range(inode->i_mapping, start_pos,
-                                     start_pos + num_written - 1,
-                                     SYNC_FILE_RANGE_WRITE |
-                                     SYNC_FILE_RANGE_WAIT_AFTER);
-               invalidate_mapping_pages(inode->i_mapping,
-                     start_pos >> PAGE_CACHE_SHIFT,
-                    (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
        }
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
@@ -1034,6 +1151,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
        return 0;
 }
 
+/*
+ * fsync call for both files and directories.  This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all the ordered extent updates
+ * have made it into the metadata btree and are up to date for copying to
+ * the log.
+ *
+ * It drops the inode mutex before doing the tree log commit.  This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;