Merge branch 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 31 Jul 2011 16:26:50 +0000 (06:26 -1000)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 31 Jul 2011 16:26:50 +0000 (06:26 -1000)
* 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (28 commits)
  pnfsblock: write_pagelist handle zero invalid extents
  pnfsblock: note written INVAL areas for layoutcommit
  pnfsblock: bl_write_pagelist
  pnfsblock: bl_read_pagelist
  pnfsblock: cleanup_layoutcommit
  pnfsblock: encode_layoutcommit
  pnfsblock: merge rw extents
  pnfsblock: add extent manipulation functions
  pnfsblock: bl_find_get_extent
  pnfsblock: xdr decode pnfs_block_layout4
  pnfsblock: call and parse getdevicelist
  pnfsblock: merge extents
  pnfsblock: lseg alloc and free
  pnfsblock: remove device operations
  pnfsblock: add device operations
  pnfsblock: basic extent code
  pnfsblock: use pageio_ops api
  pnfsblock: add blocklayout Kconfig option, Makefile, and stubs
  pnfs: cleanup_layoutcommit
  pnfs: ask for layout_blksize and save it in nfs_server
  ...

21 files changed:
fs/nfs/Kconfig
fs/nfs/Makefile
fs/nfs/blocklayout/Makefile [new file with mode: 0644]
fs/nfs/blocklayout/blocklayout.c [new file with mode: 0644]
fs/nfs/blocklayout/blocklayout.h [new file with mode: 0644]
fs/nfs/blocklayout/blocklayoutdev.c [new file with mode: 0644]
fs/nfs/blocklayout/blocklayoutdm.c [new file with mode: 0644]
fs/nfs/blocklayout/extents.c [new file with mode: 0644]
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4filelayout.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4xdr.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
include/linux/nfs.h
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h

index 2cde5d9..be02077 100644 (file)
@@ -79,15 +79,21 @@ config NFS_V4_1
        depends on NFS_FS && NFS_V4 && EXPERIMENTAL
        select SUNRPC_BACKCHANNEL
        select PNFS_FILE_LAYOUT
+       select PNFS_BLOCK
+       select MD
+       select BLK_DEV_DM
        help
          This option enables support for minor version 1 of the NFSv4 protocol
-         (RFC 5661) in the kernel's NFS client.
+         (RFC 5661 and RFC 5663) in the kernel's NFS client.
 
          If unsure, say N.
 
 config PNFS_FILE_LAYOUT
        tristate
 
+config PNFS_BLOCK
+       tristate
+
 config PNFS_OBJLAYOUT
        tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
        depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
index 6a34f7d..b58613d 100644 (file)
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
 
 obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644 (file)
index 0000000..d581550
--- /dev/null
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644 (file)
index 0000000..e56564d
--- /dev/null
@@ -0,0 +1,1019 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h>         /* struct bio */
+#include <linux/buffer_head.h> /* various write calls */
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY        NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+struct dentry *bl_device_pipe;
+wait_queue_head_t bl_wq;
+
+/* Debug helper: log the address of page and a selection of its state
+ * flags through dprintk, one flag per line.
+ */
+static void print_page(struct page *page)
+{
+       dprintk("PRINTPAGE page %p\n", page);
+       dprintk("       PagePrivate %d\n", PagePrivate(page));
+       dprintk("       PageUptodate %d\n", PageUptodate(page));
+       dprintk("       PageError %d\n", PageError(page));
+       dprintk("       PageDirty %d\n", PageDirty(page));
+       dprintk("       PageReferenced %d\n", PageReferenced(page));
+       dprintk("       PageLocked %d\n", PageLocked(page));
+       dprintk("       PageWriteback %d\n", PageWriteback(page));
+       dprintk("       PageMappedToDisk %d\n", PageMappedToDisk(page));
+       dprintk("\n");
+}
+
+/* Decide whether the page backing sector isect has to be initialized
+ * instead of being read through extent be:
+ *  - a PNFS_BLOCK_NONE_DATA extent is always a hole;
+ *  - a PNFS_BLOCK_INVALID_DATA extent is a hole until
+ *    bl_is_sector_init() reports the sector as initialized;
+ *  - every other extent state is never a hole.
+ * Returns nonzero for a hole and 0 otherwise.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+       switch (be->be_state) {
+       case PNFS_BLOCK_NONE_DATA:
+               return 1;
+       case PNFS_BLOCK_INVALID_DATA:
+               return !bl_is_sector_init(be->be_inval, isect);
+       default:
+               return 0;
+       }
+}
+
+/* Report whether page data for sector isect may be written to disk through
+ * extent be.  Only the extent state is examined: read-write and invalid
+ * extents are writable.  isect is accepted but not used.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+       if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
+               return 1;
+       return be->be_state == PNFS_BLOCK_INVALID_DATA;
+}
+
+/* The data we are handed might be spread across several bios.  We need
+ * to track when the last one is finished.
+ *
+ * alloc_parallel() creates the object holding one reference,
+ * bl_submit_bio() takes another for every bio it submits, and the bio
+ * end_io handlers as well as the pagelist functions drop theirs with
+ * put_parallel().  Once the count reaches zero destroy_parallel() calls
+ * pnfs_callback(data) and frees the structure.
+ */
+struct parallel_io {
+       struct kref refcnt; /* creator's reference plus one per submitted bio */
+       struct rpc_call_ops call_ops; /* copy of mds_ops, rpc_call_done replaced by a stub */
+       void (*pnfs_callback) (void *data); /* invoked once by destroy_parallel() */
+       void *data; /* the nfs_read_data or nfs_write_data being serviced */
+};
+
+/* Allocate a parallel_io for one request.  On success the object holds a
+ * single initial reference and remembers data for the completion callback;
+ * call_ops and pnfs_callback are left for the caller to fill in.
+ * Returns NULL when the allocation fails.
+ */
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+       struct parallel_io *par = kmalloc(sizeof(*par), GFP_NOFS);
+
+       if (!par)
+               return NULL;
+       par->data = data;
+       kref_init(&par->refcnt);
+       return par;
+}
+
+/* Take an additional reference on par. */
+static inline void get_parallel(struct parallel_io *par)
+{
+       kref_get(&par->refcnt);
+}
+
+/* kref release function for a parallel_io: run the completion callback on
+ * the stored request data, then free the tracking structure.
+ */
+static void destroy_parallel(struct kref *kref)
+{
+       struct parallel_io *par;
+
+       par = container_of(kref, struct parallel_io, refcnt);
+       dprintk("%s enter\n", __func__);
+       par->pnfs_callback(par->data);
+       kfree(par);
+}
+
+/* Drop one reference on par; destroy_parallel() runs when it was the last. */
+static inline void put_parallel(struct parallel_io *par)
+{
+       kref_put(&par->refcnt, destroy_parallel);
+}
+
+/* Submit bio (if any) for rw I/O, first taking a parallel_io reference on
+ * behalf of the bio so that its end_io handler can drop it.
+ *
+ * bio must be either NULL or a valid bio whose bi_private points at a
+ * parallel_io; any other non-NULL value (e.g. an ERR_PTR) would be
+ * dereferenced.  Always returns NULL so callers can write
+ * "bio = bl_submit_bio(rw, bio)" to start a fresh bio.
+ */
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+       if (!bio)
+               return NULL;
+
+       get_parallel(bio->bi_private);
+       dprintk("%s submitting %s bio %u@%llu\n", __func__,
+               rw == READ ? "read" : "write",
+               bio->bi_size, (unsigned long long)bio->bi_sector);
+       submit_bio(rw, bio);
+       return NULL;
+}
+
+/* Allocate a bio sized for npg pages and point it at the device location
+ * of file sector isect inside extent be (isect is translated from the
+ * extent's file offset to its volume offset).  end_io and par become the
+ * bio's completion handler and private data.
+ * Returns NULL if bio_alloc() fails.
+ */
+static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
+                                    struct pnfs_block_extent *be,
+                                    void (*end_io)(struct bio *, int err),
+                                    struct parallel_io *par)
+{
+       struct bio *new_bio = bio_alloc(GFP_NOIO, npg);
+
+       if (new_bio) {
+               new_bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+               new_bio->bi_bdev = be->be_mdev;
+               new_bio->bi_end_io = end_io;
+               new_bio->bi_private = par;
+       }
+       return new_bio;
+}
+
+/* Append page to bio, allocating a bio first when none is open.  If the
+ * current bio cannot take a whole page it is submitted and the page is
+ * retried on a freshly allocated one.
+ *
+ * Returns the bio now holding the page, or ERR_PTR(-ENOMEM) when a bio
+ * could not be allocated; in the error case the page has not been added
+ * to any bio.
+ */
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+                                     sector_t isect, struct page *page,
+                                     struct pnfs_block_extent *be,
+                                     void (*end_io)(struct bio *, int err),
+                                     struct parallel_io *par)
+{
+       for (;;) {
+               if (!bio) {
+                       bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+                       if (!bio)
+                               return ERR_PTR(-ENOMEM);
+               }
+               if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) >=
+                   PAGE_CACHE_SIZE)
+                       return bio;
+               /* Bio is full: send it off and start over with a new one */
+               bio = bl_submit_bio(rw, bio);
+       }
+}
+
+/* Set the layout header's fail bit matching the iomode of lseg:
+ * the IOMODE_RW bit for read-write segments, the IOMODE_READ bit for
+ * everything else.
+ */
+static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+       if (lseg->pls_range.iomode != IOMODE_RW) {
+               dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+               set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+               return;
+       }
+       dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+       set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+}
+
+/* Completion handler for read bios; basically copied from
+ * mpage_end_io_read.
+ *
+ * Walks the bio's pages from last to first, marking each uptodate when the
+ * bio succeeded.  On failure the first error is recorded as -EIO in the
+ * read request and the layout is flagged as failed.  Finally the bio and
+ * the parallel_io reference taken at submission are released.
+ */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+       struct parallel_io *par = bio->bi_private;
+       struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       do {
+               struct page *page = bvec->bv_page;
+
+               bvec--;
+               if (bvec >= bio->bi_io_vec)
+                       prefetchw(&bvec->bv_page->flags);
+               if (uptodate)
+                       SetPageUptodate(page);
+       } while (bvec >= bio->bi_io_vec);
+
+       if (!uptodate) {
+               /* Keep an earlier error if one was already recorded */
+               if (rdata->pnfs_error == 0)
+                       rdata->pnfs_error = -EIO;
+               bl_set_lo_fail(rdata->lseg);
+       }
+       bio_put(bio);
+       put_parallel(par);
+}
+
+/* Work item scheduled by bl_end_par_io_read: recover the nfs_read_data
+ * that embeds the work struct and report the read as done.
+ */
+static void bl_read_cleanup(struct work_struct *work)
+{
+       struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+       struct nfs_read_data *rdata =
+               container_of(task, struct nfs_read_data, task);
+
+       dprintk("%s enter\n", __func__);
+       pnfs_ld_read_done(rdata);
+}
+
+/* parallel_io callback for reads: called once the last reference on the
+ * parallel_io is dropped; defers the completion work to bl_read_cleanup
+ * via the work struct embedded in the request's rpc_task.
+ */
+static void
+bl_end_par_io_read(void *data)
+{
+       struct nfs_read_data *rdata = data;
+       struct work_struct *work = &rdata->task.u.tk_work;
+
+       INIT_WORK(work, bl_read_cleanup);
+       schedule_work(work);
+}
+
+/* Stub installed as .rpc_call_done in the copied call_ops so that the
+ * normal completion callback is not used; intentionally does nothing.
+ */
+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
+{
+}
+
+/* Read the pages of rdata straight from the block devices described by the
+ * layout's extents.
+ *
+ * Pages lying in a hole (see is_hole()) with no copy-on-write source are
+ * zero-filled and marked uptodate without accessing the device; all other
+ * pages are gathered into bios completed by bl_end_io_read().  A
+ * parallel_io tracks the outstanding bios; dropping its last reference
+ * runs bl_end_par_io_read(), which schedules the final completion.
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io cannot be
+ * allocated.  Otherwise returns PNFS_ATTEMPTED, with any failure recorded
+ * in rdata->pnfs_error.
+ */
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata)
+{
+       int i, hole;
+       struct bio *bio = NULL;
+       struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+       sector_t isect, extent_length = 0;
+       struct parallel_io *par;
+       loff_t f_offset = rdata->args.offset;
+       size_t count = rdata->args.count;
+       struct page **pages = rdata->args.pages;
+       int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+       dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+              rdata->npages, f_offset, count);
+
+       par = alloc_parallel(rdata);
+       if (!par)
+               goto use_mds;
+       par->call_ops = *rdata->mds_ops;
+       par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+       par->pnfs_callback = bl_end_par_io_read;
+       /* At this point, we can no longer jump to use_mds */
+
+       isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+       /* Code assumes extents are page-aligned */
+       for (i = pg_index; i < rdata->npages; i++) {
+               if (!extent_length) {
+                       /* We've used up the previous extent */
+                       bl_put_extent(be);
+                       bl_put_extent(cow_read);
+                       bio = bl_submit_bio(READ, bio);
+                       /* Get the next one */
+                       be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+                                            isect, &cow_read);
+                       if (!be) {
+                               rdata->pnfs_error = -EIO;
+                               goto out;
+                       }
+                       extent_length = be->be_length -
+                               (isect - be->be_f_offset);
+                       if (cow_read) {
+                               sector_t cow_length = cow_read->be_length -
+                                       (isect - cow_read->be_f_offset);
+                               extent_length = min(extent_length, cow_length);
+                       }
+               }
+               hole = is_hole(be, isect);
+               if (hole && !cow_read) {
+                       bio = bl_submit_bio(READ, bio);
+                       /* Fill hole w/ zeroes w/o accessing device */
+                       dprintk("%s Zeroing page for hole\n", __func__);
+                       zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+                       print_page(pages[i]);
+                       SetPageUptodate(pages[i]);
+               } else {
+                       struct pnfs_block_extent *be_read;
+
+                       /* A hole with a COW source is read from that source */
+                       be_read = (hole && cow_read) ? cow_read : be;
+                       bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+                                                isect, pages[i], be_read,
+                                                bl_end_io_read, par);
+                       if (IS_ERR(bio)) {
+                               rdata->pnfs_error = PTR_ERR(bio);
+                               /* Don't hand the ERR_PTR to bl_submit_bio
+                                * at out:, it would be dereferenced.
+                                */
+                               bio = NULL;
+                               goto out;
+                       }
+               }
+               isect += PAGE_CACHE_SECTORS;
+               extent_length -= PAGE_CACHE_SECTORS;
+       }
+       if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+               rdata->res.eof = 1;
+               rdata->res.count = rdata->inode->i_size - f_offset;
+       } else {
+               rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+       }
+out:
+       bl_put_extent(be);
+       bl_put_extent(cow_read);
+       bl_submit_bio(READ, bio);
+       /* Drop the initial reference; completion runs after the last bio */
+       put_parallel(par);
+       return PNFS_ATTEMPTED;
+
+ use_mds:
+       dprintk("Giving up and using normal NFS\n");
+       return PNFS_NOT_ATTEMPTED;
+}
+
+/* Note the byte range [offset, offset + count) as written.  The range is
+ * widened to whole pages, converted to sectors and walked extent by
+ * extent; the part covered by each PNFS_BLOCK_INVALID_DATA extent is
+ * passed to bl_mark_for_commit().  A zero count is a no-op.
+ *
+ * Every sector of the range must be covered by an extent of bl,
+ * otherwise the BUG_ON triggers.
+ */
+static void mark_extents_written(struct pnfs_block_layout *bl,
+                                __u64 offset, __u32 count)
+{
+       sector_t isect, end;
+
+       dprintk("%s(%llu, %u)\n", __func__, offset, count);
+       if (!count)
+               return;
+       /* Round start down and end up to page boundaries */
+       isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
+       end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
+       end >>= SECTOR_SHIFT;
+       while (isect < end) {
+               struct pnfs_block_extent *be;
+               sector_t len;
+
+               be = bl_find_get_extent(bl, isect, NULL);
+               BUG_ON(!be); /* FIXME */
+               len = min(end, be->be_f_offset + be->be_length) - isect;
+               if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+                       bl_mark_for_commit(be, isect, len); /* What if fails? */
+               isect += len;
+               bl_put_extent(be);
+       }
+}
+
+/* Completion handler for bios carrying the zeroing pages added by
+ * bl_write_pagelist.  Every page of the bio has its writeback state ended
+ * and its reference released; a failed bio records -EIO (unless an error
+ * is already set) and flags the layout as failed.  The bio and the
+ * parallel_io reference taken at submission are then dropped.
+ */
+static void bl_end_io_write_zero(struct bio *bio, int err)
+{
+       struct parallel_io *par = bio->bi_private;
+       struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       do {
+               struct page *page = bvec->bv_page;
+
+               bvec--;
+               if (bvec >= bio->bi_io_vec)
+                       prefetchw(&bvec->bv_page->flags);
+               /* This is the zeroing page we added */
+               end_page_writeback(page);
+               page_cache_release(page);
+       } while (bvec >= bio->bi_io_vec);
+
+       if (!uptodate) {
+               if (wdata->pnfs_error == 0)
+                       wdata->pnfs_error = -EIO;
+               bl_set_lo_fail(wdata->lseg);
+       }
+       bio_put(bio);
+       put_parallel(par);
+}
+
+/* Completion handler for bios carrying the caller's data pages.  The pages
+ * themselves are not touched here; a failed bio records -EIO (unless an
+ * error is already set) and flags the layout as failed, then the bio and
+ * the parallel_io reference taken at submission are dropped.
+ */
+static void bl_end_io_write(struct bio *bio, int err)
+{
+       struct parallel_io *par = bio->bi_private;
+       struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       if (!uptodate) {
+               if (wdata->pnfs_error == 0)
+                       wdata->pnfs_error = -EIO;
+               bl_set_lo_fail(wdata->lseg);
+       }
+       bio_put(bio);
+       put_parallel(par);
+}
+
+/* Work item scheduled by bl_end_par_io_write.  When the write finished
+ * without error the written range is marked for LAYOUTCOMMIT; in every
+ * case the write is then reported as done.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+       struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+       struct nfs_write_data *wdata =
+               container_of(task, struct nfs_write_data, task);
+
+       dprintk("%s enter\n", __func__);
+       if (wdata->pnfs_error == 0) {
+               /* Marks for LAYOUTCOMMIT */
+               mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+                                    wdata->args.offset, wdata->args.count);
+       }
+       pnfs_ld_write_done(wdata);
+}
+
+/* parallel_io callback for writes: runs once the last bio belonging to a
+ * bl_write_pagelist call has finished.  Sets the task status to 0 and the
+ * verifier to NFS_FILE_SYNC, then defers the rest to bl_write_cleanup.
+ */
+static void bl_end_par_io_write(void *data)
+{
+       struct nfs_write_data *wdata = data;
+       struct work_struct *work = &wdata->task.u.tk_work;
+
+       wdata->task.tk_status = 0;
+       wdata->verf.committed = NFS_FILE_SYNC;
+       INIT_WORK(work, bl_write_cleanup);
+       schedule_work(work);
+}
+
+/* FIXME STUB - should mark the intersection of layout and page as bad so
+ * that it is not used again.  Currently a no-op.
+ */
+static void mark_bad_read(void)
+{
+}
+
+/*
+ * map_block:  map a requested I/O block (isect) into an offset in the LVM
+ * block_device
+ *
+ * Marks bh as mapped, points it at the extent's device and converts the
+ * sector (translated from the extent's file offset to its volume offset)
+ * into a block number in units of the device's block size.
+ */
+static void
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
+{
+       sector_t dev_sect;
+
+       dprintk("%s enter be=%p\n", __func__, be);
+
+       set_buffer_mapped(bh);
+       bh->b_bdev = be->be_mdev;
+       dev_sect = isect - be->be_f_offset + be->be_v_offset;
+       bh->b_blocknr = dev_sect >>
+           (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
+
+       dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
+               __func__, (unsigned long long)isect, (long)bh->b_blocknr,
+               bh->b_size);
+}
+
+/* Prepare a not-yet-uptodate page for writing; the page is locked by the
+ * caller.  Without a copy-on-write extent the page is simply zeroed; with
+ * one, the page contents are read in from cow_read through a temporary
+ * buffer head.  On success the page is marked uptodate.
+ *
+ * bl_put_extent(cow_read) is called on every path, success or failure.
+ * Returns 0 on success, -ENOMEM if no buffer head could be allocated, or
+ * the error returned by bh_submit_read().
+ */
+static int
+init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+{
+       struct buffer_head *bh = NULL;
+       sector_t isect;
+       int ret = 0;
+
+       dprintk("%s enter, %p\n", __func__, page);
+       BUG_ON(PageUptodate(page));
+       if (!cow_read) {
+               /* Nothing to copy from: start with an all-zero page */
+               zero_user_segment(page, 0, PAGE_SIZE);
+               SetPageUptodate(page);
+               goto cleanup;
+       }
+
+       bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+       if (!bh) {
+               ret = -ENOMEM;
+               goto cleanup;
+       }
+
+       isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
+       map_block(bh, isect, cow_read);
+       if (!bh_uptodate_or_lock(bh))
+               ret = bh_submit_read(bh);
+       if (!ret)
+               SetPageUptodate(page);
+
+cleanup:
+       bl_put_extent(cow_read);
+       if (bh)
+               free_buffer_head(bh);
+       if (ret) {
+               /* Need to mark layout with bad read...should now
+                * just use nfs4 for reads and writes.
+                */
+               mark_bad_read();
+       }
+       return ret;
+}
+
+/* Write the pages of wdata straight to the block devices described by the
+ * layout's extents.  The work is done in up to three phases:
+ *
+ *  1. If the first page lies in a PNFS_BLOCK_INVALID_DATA extent, the
+ *     pages between the start of its pnfs block and the write offset are
+ *     initialized (zeroed or read from the copy-on-write extent) and
+ *     written out, completed by bl_end_io_write_zero().
+ *  2. The caller's pages are written, completed by bl_end_io_write().
+ *  3. If the last extent is PNFS_BLOCK_INVALID_DATA, the remaining pages
+ *     up to the end of the pnfs block are handled like phase 1.
+ *
+ * A parallel_io tracks the outstanding bios; dropping its last reference
+ * runs bl_end_par_io_write(), which schedules the final completion.
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io cannot be
+ * allocated.  Otherwise returns PNFS_ATTEMPTED, with any failure recorded
+ * in wdata->pnfs_error.  sync is not used.
+ *
+ * NOTE(review): the return value of init_page_for_write() is ignored, and
+ * that function calls bl_put_extent(cow_read) each time it is invoked.
+ */
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_write_data *wdata, int sync)
+{
+       int i, ret, npg_zero, pg_index, last = 0;
+       struct bio *bio = NULL;
+       struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+       sector_t isect, last_isect = 0, extent_length = 0;
+       struct parallel_io *par;
+       loff_t offset = wdata->args.offset;
+       size_t count = wdata->args.count;
+       struct page **pages = wdata->args.pages;
+       struct page *page;
+       pgoff_t index;
+       u64 temp;
+       int npg_per_block =
+           NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+
+       dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+       /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+        * We want to write each, and if there is an error set pnfs_error
+        * to have it redone using nfs.
+        */
+       par = alloc_parallel(wdata);
+       if (!par)
+               return PNFS_NOT_ATTEMPTED;
+       par->call_ops = *wdata->mds_ops;
+       par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+       par->pnfs_callback = bl_end_par_io_write;
+       /* At this point, have to be more careful with error handling */
+
+       isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+       be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+       if (!be || !is_writable(be, isect)) {
+               dprintk("%s no matching extents!\n", __func__);
+               wdata->pnfs_error = -EINVAL;
+               goto out;
+       }
+
+       /* First page inside INVALID extent */
+       if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+               /* npg_zero = number of pages between the start of the
+                * pnfs block and the first page being written.
+                */
+               temp = offset >> PAGE_CACHE_SHIFT;
+               npg_zero = do_div(temp, npg_per_block);
+               isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
+                                    (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+               extent_length = be->be_length - (isect - be->be_f_offset);
+
+fill_invalid_ext:
+               dprintk("%s need to zero %d pages\n", __func__, npg_zero);
+               for (;npg_zero > 0; npg_zero--) {
+                       /* page ref released in bl_end_io_write_zero */
+                       index = isect >> PAGE_CACHE_SECTOR_SHIFT;
+                       dprintk("%s zero %dth page: index %lu isect %llu\n",
+                               __func__, npg_zero, index,
+                               (unsigned long long)isect);
+                       page =
+                           find_or_create_page(wdata->inode->i_mapping, index,
+                                               GFP_NOFS);
+                       if (!page) {
+                               dprintk("%s oom\n", __func__);
+                               wdata->pnfs_error = -ENOMEM;
+                               goto out;
+                       }
+
+                       /* PageDirty: Other will write this out
+                        * PageWriteback: Other is writing this out
+                        * PageUptodate: It was read before
+                        * sector_initialized: already written out
+                        */
+                       if (PageDirty(page) || PageWriteback(page) ||
+                           bl_is_sector_init(be->be_inval, isect)) {
+                               print_page(page);
+                               unlock_page(page);
+                               page_cache_release(page);
+                               goto next_page;
+                       }
+                       if (!PageUptodate(page)) {
+                               /* New page, readin or zero it */
+                               init_page_for_write(page, cow_read);
+                       }
+                       set_page_writeback(page);
+                       unlock_page(page);
+
+                       ret = bl_mark_sectors_init(be->be_inval, isect,
+                                                      PAGE_CACHE_SECTORS,
+                                                      NULL);
+                       if (unlikely(ret)) {
+                               dprintk("%s bl_mark_sectors_init fail %d\n",
+                                       __func__, ret);
+                               end_page_writeback(page);
+                               page_cache_release(page);
+                               wdata->pnfs_error = ret;
+                               goto out;
+                       }
+                       bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
+                                                isect, page, be,
+                                                bl_end_io_write_zero, par);
+                       if (IS_ERR(bio)) {
+                               /* The page never made it into a bio, so
+                                * bl_end_io_write_zero will not clean it
+                                * up; undo writeback and drop our ref here.
+                                */
+                               end_page_writeback(page);
+                               page_cache_release(page);
+                               wdata->pnfs_error = PTR_ERR(bio);
+                               /* Don't hand the ERR_PTR to bl_submit_bio
+                                * at out:, it would be dereferenced.
+                                */
+                               bio = NULL;
+                               goto out;
+                       }
+                       /* FIXME: This should be done in bi_end_io */
+                       mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+                                            page->index << PAGE_CACHE_SHIFT,
+                                            PAGE_CACHE_SIZE);
+next_page:
+                       isect += PAGE_CACHE_SECTORS;
+                       extent_length -= PAGE_CACHE_SECTORS;
+               }
+               if (last)
+                       goto write_done;
+       }
+       bio = bl_submit_bio(WRITE, bio);
+
+       /* Middle pages */
+       pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+       for (i = pg_index; i < wdata->npages; i++) {
+               if (!extent_length) {
+                       /* We've used up the previous extent */
+                       bl_put_extent(be);
+                       bio = bl_submit_bio(WRITE, bio);
+                       /* Get the next one */
+                       be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+                                            isect, NULL);
+                       if (!be || !is_writable(be, isect)) {
+                               wdata->pnfs_error = -EINVAL;
+                               goto out;
+                       }
+                       extent_length = be->be_length -
+                           (isect - be->be_f_offset);
+               }
+               if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+                       ret = bl_mark_sectors_init(be->be_inval, isect,
+                                                      PAGE_CACHE_SECTORS,
+                                                      NULL);
+                       if (unlikely(ret)) {
+                               dprintk("%s bl_mark_sectors_init fail %d\n",
+                                       __func__, ret);
+                               wdata->pnfs_error = ret;
+                               goto out;
+                       }
+               }
+               bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+                                        isect, pages[i], be,
+                                        bl_end_io_write, par);
+               if (IS_ERR(bio)) {
+                       wdata->pnfs_error = PTR_ERR(bio);
+                       /* See above: never pass an ERR_PTR to bl_submit_bio */
+                       bio = NULL;
+                       goto out;
+               }
+               isect += PAGE_CACHE_SECTORS;
+               last_isect = isect;
+               extent_length -= PAGE_CACHE_SECTORS;
+       }
+
+       /* Last page inside INVALID extent */
+       if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+               bio = bl_submit_bio(WRITE, bio);
+               /* npg_zero = pages left until the end of the pnfs block;
+                * a full block's worth means we already end on a boundary.
+                */
+               temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
+               npg_zero = npg_per_block - do_div(temp, npg_per_block);
+               if (npg_zero < npg_per_block) {
+                       last = 1;
+                       goto fill_invalid_ext;
+               }
+       }
+
+write_done:
+       wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
+       if (count < wdata->res.count) {
+               wdata->res.count = count;
+       }
+out:
+       bl_put_extent(be);
+       bl_submit_bio(WRITE, bio);
+       /* Drop the initial reference; completion runs after the last bio */
+       put_parallel(par);
+       return PNFS_ATTEMPTED;
+}
+
+/* Unlink every extent from all of the layout's extent lists and call
+ * bl_put_extent() on each, under bl_ext_lock.
+ * FIXME - range is ignored: the whole layout is always released.
+ */
+static void
+release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
+{
+       int i;
+
+       spin_lock(&bl->bl_ext_lock);
+       for (i = 0; i < EXTENT_LISTS; i++) {
+               struct list_head *head = &bl->bl_extents[i];
+
+               while (!list_empty(head)) {
+                       struct pnfs_block_extent *be =
+                               list_first_entry(head,
+                                                struct pnfs_block_extent,
+                                                be_node);
+
+                       list_del(&be->be_node);
+                       bl_put_extent(be);
+               }
+       }
+       spin_unlock(&bl->bl_ext_lock);
+}
+
+/* Free every pnfs_inval_tracking entry linked on marks' tracking list. */
+static void
+release_inval_marks(struct pnfs_inval_markings *marks)
+{
+       struct pnfs_inval_tracking *entry, *next;
+
+       list_for_each_entry_safe(entry, next, &marks->im_tree.mtt_stub,
+                                it_link) {
+               list_del(&entry->it_link);
+               kfree(entry);
+       }
+}
+
+/* Tear down a block layout header: release all of its extents and
+ * invalid-sector markings, then free the containing pnfs_block_layout.
+ */
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+       struct pnfs_block_layout *blk_lo = BLK_LO2EXT(lo);
+
+       dprintk("%s enter\n", __func__);
+       release_extents(blk_lo, NULL);
+       release_inval_marks(&blk_lo->bl_inval);
+       kfree(blk_lo);
+}
+
+/* Allocate and initialize a block layout header for inode.
+ *
+ * The extent lock, all EXTENT_LISTS extent lists and the commit lists are
+ * initialized, and the layout block size is taken from the server's
+ * pnfs_blksize, converted from bytes to sectors.
+ * Returns the embedded generic layout header, or NULL on allocation
+ * failure.
+ */
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+                                                  gfp_t gfp_flags)
+{
+       struct pnfs_block_layout *bl;
+       int i;
+
+       dprintk("%s enter\n", __func__);
+       bl = kzalloc(sizeof(*bl), gfp_flags);
+       if (!bl)
+               return NULL;
+       spin_lock_init(&bl->bl_ext_lock);
+       /* Same bound as release_extents(), so every list it walks is valid */
+       for (i = 0; i < EXTENT_LISTS; i++)
+               INIT_LIST_HEAD(&bl->bl_extents[i]);
+       INIT_LIST_HEAD(&bl->bl_commit);
+       INIT_LIST_HEAD(&bl->bl_committing);
+       bl->bl_count = 0;
+       bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
+       BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+       return &bl->bl_layout;
+}
+
+/* Free a layout segment allocated by bl_alloc_lseg; only the segment
+ * structure itself is released here.
+ */
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+       dprintk("%s enter\n", __func__);
+       kfree(lseg);
+}
+
+/* Allocate a layout segment for a LAYOUTGET result.  We pretty much ignore
+ * lseg, and store all data layout wide, so we can correctly merge: the
+ * result is handed to nfs4_blk_process_layoutget() together with lo.
+ *
+ * Returns the new segment, ERR_PTR(-ENOMEM) if it cannot be allocated, or
+ * ERR_PTR(status) when processing the layout fails.
+ */
+static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+                                                struct nfs4_layoutget_res *lgr,
+                                                gfp_t gfp_flags)
+{
+       struct pnfs_layout_segment *lseg;
+       int status;
+
+       dprintk("%s enter\n", __func__);
+       lseg = kzalloc(sizeof(*lseg), gfp_flags);
+       if (!lseg)
+               return ERR_PTR(-ENOMEM);
+
+       status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+       if (!status)
+               return lseg;
+
+       /* We don't want to call the full-blown bl_free_lseg,
+        * since on error extents were not touched.
+        */
+       kfree(lseg);
+       return ERR_PTR(status);
+}
+
+/* LAYOUTCOMMIT encode hook: forwards to encode_pnfs_block_layoutupdate()
+ * with the block layout that contains lo.
+ */
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+                      const struct nfs4_layoutcommit_args *arg)
+{
+       struct pnfs_block_layout *blk_lo = BLK_LO2EXT(lo);
+
+       dprintk("%s enter\n", __func__);
+       encode_pnfs_block_layoutupdate(blk_lo, xdr, arg);
+}
+
+/* Layout driver hook: pass the LAYOUTCOMMIT arguments and result status to
+ * clean_pnfs_block_layoutupdate() for the layout attached to the inode.
+ */
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+       struct inode *ino = lcdata->args.inode;
+       struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+       dprintk("%s enter\n", __func__);
+       clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args,
+                                     lcdata->res.status);
+}
+
+/* Tear down a block_mount_id: release every pnfs_block_dev on its device
+ * list, then free the structure itself.  A NULL @mid is a no-op.
+ *
+ * Each device is unlinked while holding bm_lock, but bl_free_block_dev()
+ * is called only after the lock has been dropped.
+ * NOTE(review): bl_free_block_dev() appears to perform an rpc_pipefs upcall
+ * and wait for the reply, which must not happen under a spinlock - confirm
+ * against blocklayoutdm.c.
+ */
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+       struct pnfs_block_dev *dev;
+
+       if (!mid)
+               return;
+
+       for (;;) {
+               spin_lock(&mid->bm_lock);
+               if (list_empty(&mid->bm_devlist)) {
+                       spin_unlock(&mid->bm_lock);
+                       break;
+               }
+               dev = list_first_entry(&mid->bm_devlist,
+                                      struct pnfs_block_dev,
+                                      bm_node);
+               list_del(&dev->bm_node);
+               spin_unlock(&mid->bm_lock);
+
+               /* Entry is now private to us; safe to free unlocked */
+               bl_free_block_dev(dev);
+       }
+       kfree(mid);
+}
+
+/* This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
+ *
+ * Issue GETDEVICEINFO for @d_id into a temporary page array sized from the
+ * session's maximum response size, then hand the reply to
+ * nfs4_blk_decode_device().
+ *
+ * @fh is currently unused.
+ *
+ * Returns the decoded device, or NULL on any failure.  Error pointers coming
+ * back from nfs4_blk_decode_device() are folded into NULL because that is the
+ * only failure value this function's caller tests for.
+ */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+                       struct nfs4_deviceid *d_id)
+{
+       struct pnfs_device *dev;
+       struct pnfs_block_dev *rv = NULL;
+       u32 max_resp_sz;
+       int max_pages;
+       struct page **pages = NULL;
+       int i, rc;
+
+       /*
+        * Use the session max response size as the basis for setting
+        * GETDEVICEINFO's maxcount
+        */
+       max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+       max_pages = max_resp_sz >> PAGE_SHIFT;
+       dprintk("%s max_resp_sz %u max_pages %d\n",
+               __func__, max_resp_sz, max_pages);
+
+       dev = kmalloc(sizeof(*dev), GFP_NOFS);
+       if (!dev) {
+               dprintk("%s kmalloc failed\n", __func__);
+               return NULL;
+       }
+
+       /* Zeroed so that the cleanup loop can tell which slots were filled */
+       pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+       if (pages == NULL) {
+               kfree(dev);
+               return NULL;
+       }
+       for (i = 0; i < max_pages; i++) {
+               pages[i] = alloc_page(GFP_NOFS);
+               if (!pages[i])
+                       goto out_free;
+       }
+
+       memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+       dev->layout_type = LAYOUT_BLOCK_VOLUME;
+       dev->pages = pages;
+       dev->pgbase = 0;
+       dev->pglen = PAGE_SIZE * max_pages;
+       dev->mincount = 0;
+
+       dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+       rc = nfs4_proc_getdeviceinfo(server, dev);
+       dprintk("%s getdevice info returns %d\n", __func__, rc);
+       if (rc)
+               goto out_free;
+
+       rv = nfs4_blk_decode_device(server, dev);
+       if (IS_ERR(rv))
+               rv = NULL;
+ out_free:
+       /* After a partial allocation failure the tail of pages[] is NULL,
+        * and __free_page() must not be handed a NULL page.
+        */
+       for (i = 0; i < max_pages; i++) {
+               if (pages[i])
+                       __free_page(pages[i]);
+       }
+       kfree(pages);
+       kfree(dev);
+       return rv;
+}
+
+/* Layout driver hook run when the block layout driver is attached to a
+ * server: walk GETDEVICELIST until eof, resolve every returned device id
+ * through nfs4_blk_get_deviceinfo(), and store the resulting device list in
+ * server->pnfs_ld_data.
+ *
+ * Returns 0 on success.  On failure returns a negative errno and releases
+ * every device collected so far.
+ */
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+       struct block_mount_id *b_mt_id = NULL;
+       struct pnfs_devicelist *dlist = NULL;
+       struct pnfs_block_dev *bdev;
+       LIST_HEAD(block_disklist);
+       int status = 0, i;
+
+       dprintk("%s enter\n", __func__);
+
+       if (server->pnfs_blksize == 0) {
+               dprintk("%s Server did not return blksize\n", __func__);
+               return -EINVAL;
+       }
+       b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
+       if (!b_mt_id) {
+               status = -ENOMEM;
+               goto out_error;
+       }
+       /* Initialize nfs4 block layout mount id */
+       spin_lock_init(&b_mt_id->bm_lock);
+       INIT_LIST_HEAD(&b_mt_id->bm_devlist);
+
+       dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
+       if (!dlist) {
+               status = -ENOMEM;
+               goto out_error;
+       }
+       dlist->eof = 0;
+       while (!dlist->eof) {
+               status = nfs4_proc_getdevicelist(server, fh, dlist);
+               if (status)
+                       goto out_error;
+               dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+                       __func__, dlist->num_devs, dlist->eof);
+               for (i = 0; i < dlist->num_devs; i++) {
+                       bdev = nfs4_blk_get_deviceinfo(server, fh,
+                                                      &dlist->dev_id[i]);
+                       /* The device decode path can yield either NULL or
+                        * an ERR_PTR(); neither may be put on the list.
+                        */
+                       if (!bdev || IS_ERR(bdev)) {
+                               status = bdev ? PTR_ERR(bdev) : -ENODEV;
+                               goto out_error;
+                       }
+                       spin_lock(&b_mt_id->bm_lock);
+                       list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+                       spin_unlock(&b_mt_id->bm_lock);
+               }
+       }
+       dprintk("%s SUCCESS\n", __func__);
+       server->pnfs_ld_data = b_mt_id;
+
+ out_return:
+       kfree(dlist);
+       return status;
+
+ out_error:
+       free_blk_mountid(b_mt_id);
+       goto out_return;
+}
+
+/* Layout driver hook: free the per-mount device list stored in
+ * server->pnfs_ld_data.  Always returns 0.
+ */
+static int
+bl_clear_layoutdriver(struct nfs_server *server)
+{
+       struct block_mount_id *mount_id;
+
+       mount_id = server->pnfs_ld_data;
+       dprintk("%s enter\n", __func__);
+
+       free_blk_mountid(mount_id);
+
+       dprintk("%s RETURNS\n", __func__);
+       return 0;
+}
+
+/* Read page-I/O operations: the block layout uses the generic pNFS helpers
+ * unchanged.
+ */
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+       .pg_init = pnfs_generic_pg_init_read,
+       .pg_test = pnfs_generic_pg_test,
+       .pg_doio = pnfs_generic_pg_readpages,
+};
+
+/* Write page-I/O operations: the block layout uses the generic pNFS helpers
+ * unchanged.
+ */
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+       .pg_init = pnfs_generic_pg_init_write,
+       .pg_test = pnfs_generic_pg_test,
+       .pg_doio = pnfs_generic_pg_writepages,
+};
+
+/* Layout driver descriptor registered with the generic pNFS code by
+ * nfs4blocklayout_init() and unregistered by nfs4blocklayout_exit().
+ */
+static struct pnfs_layoutdriver_type blocklayout_type = {
+       .id                             = LAYOUT_BLOCK_VOLUME,
+       .name                           = "LAYOUT_BLOCK_VOLUME",
+       .read_pagelist                  = bl_read_pagelist,
+       .write_pagelist                 = bl_write_pagelist,
+       .alloc_layout_hdr               = bl_alloc_layout_hdr,
+       .free_layout_hdr                = bl_free_layout_hdr,
+       .alloc_lseg                     = bl_alloc_lseg,
+       .free_lseg                      = bl_free_lseg,
+       .encode_layoutcommit            = bl_encode_layoutcommit,
+       .cleanup_layoutcommit           = bl_cleanup_layoutcommit,
+       .set_layoutdriver               = bl_set_layoutdriver,
+       .clear_layoutdriver             = bl_clear_layoutdriver,
+       .pg_read_ops                    = &bl_pg_read_ops,
+       .pg_write_ops                   = &bl_pg_write_ops,
+};
+
+/* Operations for the "blocklayout" rpc_pipefs pipe created in
+ * nfs4blocklayout_init(); the handlers are implemented in blocklayoutdev.c.
+ */
+static const struct rpc_pipe_ops bl_upcall_ops = {
+       .upcall         = bl_pipe_upcall,
+       .downcall       = bl_pipe_downcall,
+       .destroy_msg    = bl_pipe_destroy_msg,
+};
+
+/* Module init: register the block layout driver, then create the
+ * "blocklayout" pipe under NFS_PIPE_DIRNAME in rpc_pipefs.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything acquired
+ * so far is released: the path found by vfs_path_lookup(), the rpc_pipefs
+ * pin taken by rpc_get_mount(), and the layout driver registration.  On
+ * success the rpc_pipefs pin is deliberately kept.
+ */
+static int __init nfs4blocklayout_init(void)
+{
+       struct vfsmount *mnt;
+       struct path path;
+       int ret;
+
+       dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+       ret = pnfs_register_layoutdriver(&blocklayout_type);
+       if (ret)
+               goto out;
+
+       init_waitqueue_head(&bl_wq);
+
+       mnt = rpc_get_mount();
+       if (IS_ERR(mnt)) {
+               ret = PTR_ERR(mnt);
+               goto out_remove;
+       }
+
+       ret = vfs_path_lookup(mnt->mnt_root,
+                             mnt,
+                             NFS_PIPE_DIRNAME, 0, &path);
+       if (ret)
+               goto out_putrpc;
+
+       bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
+                                   &bl_upcall_ops, 0);
+       /* The looked-up path is only needed as the parent for rpc_mkpipe */
+       path_put(&path);
+       if (IS_ERR(bl_device_pipe)) {
+               ret = PTR_ERR(bl_device_pipe);
+               goto out_putrpc;
+       }
+out:
+       return ret;
+
+out_putrpc:
+       rpc_put_mount();
+out_remove:
+       pnfs_unregister_layoutdriver(&blocklayout_type);
+       return ret;
+}
+
+/* Module exit: unregister the layout driver, remove the "blocklayout" pipe
+ * and drop the rpc_pipefs pin that nfs4blocklayout_init() took with
+ * rpc_get_mount() and still holds after a successful load.
+ */
+static void __exit nfs4blocklayout_exit(void)
+{
+       dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+              __func__);
+
+       pnfs_unregister_layoutdriver(&blocklayout_type);
+       rpc_unlink(bl_device_pipe);
+       rpc_put_mount();
+}
+
+/* NOTE(review): the trailing "3" presumably is the numeric value of
+ * LAYOUT_BLOCK_VOLUME, letting the module be loaded on demand by layout
+ * type - confirm against the pNFS core's request_module() string.
+ */
+MODULE_ALIAS("nfs-layouttype4-3");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644 (file)
index 0000000..f27d827
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "../pnfs.h"
+
+/* Page cache page size expressed in sectors (SECTOR_SHIFT units), and the
+ * shift that converts between page-cache pages and sectors.
+ */
+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+
+/* Per-server block layout state, stored in nfs_server->pnfs_ld_data by
+ * bl_set_layoutdriver() and retrieved through BLK_ID().
+ */
+struct block_mount_id {
+       spinlock_t                      bm_lock;    /* protects list */
+       struct list_head                bm_devlist; /* holds pnfs_block_dev */
+};
+
+/* One device known to a mount: pairs the server's device id with the local
+ * block_device opened for it in nfs4_blk_decode_device().
+ */
+struct pnfs_block_dev {
+       struct list_head                bm_node;      /* link in block_mount_id.bm_devlist */
+       struct nfs4_deviceid            bm_mdevid;    /* associated devid */
+       struct block_device             *bm_mdev;     /* meta device itself */
+};
+
+/* State of an extent.  The value is taken directly from the XDR stream in
+ * nfs4_blk_process_layoutget(), so these numbers are wire values.
+ */
+enum exstate4 {
+       PNFS_BLOCK_READWRITE_DATA       = 0,
+       PNFS_BLOCK_READ_DATA            = 1,
+       PNFS_BLOCK_INVALID_DATA         = 2, /* mapped, but data is invalid */
+       PNFS_BLOCK_NONE_DATA            = 3  /* unmapped, it's a hole */
+};
+
+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+
+/* Placeholder container for tagged sector tracking; currently a plain list
+ * (see mtt_stub) rather than a real tree.
+ */
+struct my_tree {
+       sector_t                mtt_step_size;  /* Internal sector alignment */
+       struct list_head        mtt_stub; /* Should be a radix tree */
+};
+
+/* Tracking state for INVALID_DATA extents; set up by BL_INIT_INVAL_MARKS(). */
+struct pnfs_inval_markings {
+       spinlock_t      im_lock;
+       struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
+       sector_t        im_block_size;  /* Server blocksize in sectors */
+};
+
+/* NOTE(review): presumably one entry on my_tree.mtt_stub, recording the tag
+ * bits set for a given sector - confirm against extents.c.
+ */
+struct pnfs_inval_tracking {
+       struct list_head it_link;
+       int              it_sector;
+       int              it_tags;
+};
+
+/* One extent of a block layout, reference counted through be_refcnt.
+ * sector_t fields are all in 512-byte sectors
+ */
+struct pnfs_block_extent {
+       struct kref     be_refcnt;
+       struct list_head be_node;       /* link into lseg list */
+       struct nfs4_deviceid be_devid;  /* FIXME: could use device cache instead */
+       struct block_device *be_mdev;
+       sector_t        be_f_offset;    /* the starting offset in the file */
+       sector_t        be_length;      /* the size of the extent */
+       sector_t        be_v_offset;    /* the starting offset in the volume */
+       enum exstate4   be_state;       /* the state of this extent */
+       struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+
+/* Shortened extent used by LAYOUTCOMMIT: same identification fields as
+ * pnfs_block_extent but without volume offset, state or refcount.
+ */
+struct pnfs_block_short_extent {
+       struct list_head bse_node;
+       struct nfs4_deviceid bse_devid;
+       struct block_device *bse_mdev;
+       sector_t        bse_f_offset;   /* the starting offset in the file */
+       sector_t        bse_length;     /* the size of the extent */
+};
+
+/* Initialise an invalid-extent marking structure for a server block size of
+ * @blocksize sectors.  The internal step size is the smaller of one page
+ * cache page (in sectors) and the server block size.
+ */
+static inline void
+BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
+{
+       struct my_tree *tree = &marks->im_tree;
+
+       spin_lock_init(&marks->im_lock);
+       INIT_LIST_HEAD(&tree->mtt_stub);
+       tree->mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
+                                 blocksize);
+       marks->im_block_size = blocksize;
+}
+
+/* Index into pnfs_block_layout.bl_extents[]; see bl_choose_list(). */
+enum extentclass4 {
+       RW_EXTENT       = 0, /* READWRITE and INVAL */
+       RO_EXTENT       = 1, /* READ and NONE */
+       EXTENT_LISTS    = 2,
+};
+
+/* Map an extent state to the extent list class it is kept on:
+ * READ_DATA and NONE_DATA go to RO_EXTENT, every other value to RW_EXTENT.
+ */
+static inline int bl_choose_list(enum exstate4 state)
+{
+       switch (state) {
+       case PNFS_BLOCK_READ_DATA:
+       case PNFS_BLOCK_NONE_DATA:
+               return RO_EXTENT;
+       default:
+               return RW_EXTENT;
+       }
+}
+
+/* Block layout driver's per-inode layout: wraps the generic pnfs_layout_hdr
+ * (recovered with BLK_LO2EXT) and is allocated by bl_alloc_layout_hdr().
+ */
+struct pnfs_block_layout {
+       struct pnfs_layout_hdr bl_layout;
+       struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+       spinlock_t              bl_ext_lock;   /* Protects list manipulation */
+       struct list_head        bl_extents[EXTENT_LISTS]; /* R and RW extents */
+       struct list_head        bl_commit;      /* Needs layout commit */
+       struct list_head        bl_committing;  /* Layout committing */
+       unsigned int            bl_count;       /* entries in bl_commit */
+       sector_t                bl_blocksize;  /* Server blocksize in sectors */
+};
+
+/* Per-server block_mount_id for the inode a layout header belongs to.  The
+ * argument is parenthesised so that any pointer expression may be passed.
+ */
+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER((lo)->plh_inode)->pnfs_ld_data))
+
+/* Convert a generic layout header to the block layout that embeds it. */
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+       struct pnfs_block_layout *bl;
+
+       bl = container_of(lo, struct pnfs_block_layout, bl_layout);
+       return bl;
+}
+
+/* Block layout owning the given layout segment, via lseg->pls_layout. */
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_layout_hdr *hdr = lseg->pls_layout;
+
+       return BLK_LO2EXT(hdr);
+}
+
+/* Reply written by userspace to the blocklayout pipe (bl_pipe_downcall
+ * accepts exactly sizeof(struct bl_dev_msg) bytes).  status is compared
+ * against BL_DEVICE_REQUEST_*; major/minor name the device to open.
+ */
+struct bl_dev_msg {
+       int status;
+       uint32_t major, minor;
+};
+
+/* Header placed at the start of every upcall message; type is
+ * BL_DEVICE_MOUNT or BL_DEVICE_UMOUNT.
+ * NOTE(review): the struct is copied to userspace as-is, so any compiler
+ * padding between type and totallen is part of the message - confirm the
+ * daemon uses the same layout.
+ */
+struct bl_msg_hdr {
+       u8  type;
+       u16 totallen; /* length of entire message, including hdr itself */
+};
+
+/* rpc_pipefs pipe used for upcalls, and the wait queue on which upcall
+ * senders sleep until a downcall (or message destruction) wakes them.
+ */
+extern struct dentry *bl_device_pipe;
+extern wait_queue_head_t bl_wq;
+
+/* Values for bl_msg_hdr.type */
+#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/
+/* Values for bl_dev_msg.status */
+#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
+
+/* blocklayoutdev.c */
+ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+                      char __user *, size_t);
+ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
+struct block_device *nfs4_blkdev_get(dev_t dev);
+int nfs4_blkdev_put(struct block_device *bdev);
+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
+                                               struct pnfs_device *dev);
+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+                               struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+
+/* blocklayoutdm.c */
+void bl_free_block_dev(struct pnfs_block_dev *bdev);
+
+/* extents.c */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+               struct pnfs_block_extent **cow_read);
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+                            sector_t offset, sector_t length,
+                            sector_t **pages);
+void bl_put_extent(struct pnfs_block_extent *be);
+struct pnfs_block_extent *bl_alloc_extent(void);
+int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                                  struct xdr_stream *xdr,
+                                  const struct nfs4_layoutcommit_args *arg);
+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                                  const struct nfs4_layoutcommit_args *arg,
+                                  int status);
+int bl_add_merge_extent(struct pnfs_block_layout *bl,
+                        struct pnfs_block_extent *new);
+int bl_mark_for_commit(struct pnfs_block_extent *be,
+                       sector_t offset, sector_t length);
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644 (file)
index 0000000..a83b393
--- /dev/null
@@ -0,0 +1,410 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
+ *  Device operations for the pnfs nfs4 block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h> /* __bread */
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/* Decode a 64-bit byte quantity from the XDR stream at *rp, advance *rp past
+ * it, and store it in *sp converted to sectors.
+ *
+ * Returns 0 on success.  Returns -1, leaving *sp untouched, if the value is
+ * not a multiple of 512 bytes.
+ */
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+       uint64_t bytes;
+
+       *rp = xdr_decode_hyper(*rp, &bytes);
+       if ((bytes & 0x1ff) == 0) {
+               *sp = bytes >> SECTOR_SHIFT;
+               return 0;
+       }
+
+       printk(KERN_WARNING "%s: sector not aligned\n", __func__);
+       return -1;
+}
+
+/* Open a block_device by device number, read-only.
+ *
+ * Returns the opened device, or NULL (not an ERR_PTR) if it cannot be opened.
+ */
+struct block_device *nfs4_blkdev_get(dev_t dev)
+{
+       struct block_device *bd;
+
+       dprintk("%s enter\n", __func__);
+       bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+       if (!IS_ERR(bd))
+               return bd;
+
+       dprintk("%s failed to open device : %ld\n",
+                       __func__, PTR_ERR(bd));
+       return NULL;
+}
+
+/*
+ * Release a block device obtained from nfs4_blkdev_get(); returns whatever
+ * blkdev_put() returns.
+ */
+int nfs4_blkdev_put(struct block_device *bdev)
+{
+       dev_t devno = bdev->bd_dev;
+
+       dprintk("%s for device %d:%d\n", __func__, MAJOR(devno),
+                       MINOR(devno));
+       return blkdev_put(bdev, FMODE_READ);
+}
+
+/*
+ * Shouldn't there be a rpc_generic_upcall() to do this for us?
+ *
+ * Copy the not-yet-delivered part of @msg to the user buffer @dst, at most
+ * @buflen bytes.  Returns the number of bytes copied and advances
+ * msg->copied, or returns -EFAULT if nothing at all could be copied.
+ * msg->errno is set to match.
+ */
+ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+                      char __user *dst, size_t buflen)
+{
+       size_t want = min(msg->len - msg->copied, buflen);
+       char *from = (char *)msg->data + msg->copied;
+       unsigned long uncopied;
+       size_t done;
+
+       uncopied = copy_to_user(dst, from, want);
+       if (uncopied == want) {
+               msg->errno = -EFAULT;
+               return -EFAULT;
+       }
+
+       /* Partial copies are reported as short reads */
+       done = want - uncopied;
+       msg->copied += done;
+       msg->errno = 0;
+       return done;
+}
+
+static struct bl_dev_msg bl_mount_reply;
+
+/* Receive the userspace reply: exactly one struct bl_dev_msg, copied into
+ * the file-wide bl_mount_reply, after which sleepers on bl_wq are woken.
+ *
+ * Returns @mlen on success, -EINVAL for a wrong-sized write and -EFAULT if
+ * the user buffer cannot be read.
+ */
+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+                        size_t mlen)
+{
+       if (mlen != sizeof(bl_mount_reply))
+               return -EINVAL;
+       if (copy_from_user(&bl_mount_reply, src, mlen))
+               return -EFAULT;
+
+       wake_up(&bl_wq);
+       return mlen;
+}
+
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+       if (msg->errno >= 0)
+               return;
+       wake_up(&bl_wq);
+}
+
+/*
+ * Hand the GETDEVICEINFO reply held in dev->pages (dev->mincount bytes) to
+ * the userspace daemon through the blocklayout pipe, wait for its answer,
+ * and open the block device it names.
+ *
+ * Returns a newly allocated pnfs_block_dev on success.  On failure returns
+ * either NULL (upcall could not be queued, device could not be opened) or
+ * an ERR_PTR() (-ENOMEM, -EINVAL); callers must check for both.
+ *
+ * NOTE(review): the reply is read from the shared bl_mount_reply after any
+ * wake-up on bl_wq, including the one from bl_pipe_destroy_msg() - confirm
+ * that a stale status cannot be mistaken for a fresh reply.
+ */
+struct pnfs_block_dev *
+nfs4_blk_decode_device(struct nfs_server *server,
+                      struct pnfs_device *dev)
+{
+       struct pnfs_block_dev *rv = NULL;
+       struct block_device *bd = NULL;
+       struct rpc_pipe_msg msg;
+       struct bl_msg_hdr bl_msg = {
+               .type = BL_DEVICE_MOUNT,
+               .totallen = dev->mincount,
+       };
+       uint8_t *dataptr;
+       DECLARE_WAITQUEUE(wq, current);
+       struct bl_dev_msg *reply = &bl_mount_reply;
+       int offset, len, i;
+
+       dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+       dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+               dev->mincount);
+
+       memset(&msg, 0, sizeof(msg));
+       msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
+       if (!msg.data) {
+               rv = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       /* Message layout: header followed by the raw reply bytes, gathered
+        * page by page.
+        */
+       memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+       dataptr = (uint8_t *) msg.data;
+       len = dev->mincount;
+       offset = sizeof(bl_msg);
+       for (i = 0; len > 0; i++) {
+               memcpy(&dataptr[offset], page_address(dev->pages[i]),
+                               len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
+               len -= PAGE_CACHE_SIZE;
+               offset += PAGE_CACHE_SIZE;
+       }
+       msg.len = sizeof(bl_msg) + dev->mincount;
+
+       dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+       add_wait_queue(&bl_wq, &wq);
+       if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+               remove_wait_queue(&bl_wq, &wq);
+               goto out;
+       }
+
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule();
+       __set_current_state(TASK_RUNNING);
+       remove_wait_queue(&bl_wq, &wq);
+
+       if (reply->status != BL_DEVICE_REQUEST_PROC) {
+               dprintk("%s failed to open device: %d\n",
+                       __func__, reply->status);
+               rv = ERR_PTR(-EINVAL);
+               goto out;
+       }
+
+       /* nfs4_blkdev_get() reports failure as NULL, so IS_ERR() alone
+        * would let a NULL device through to the dereferences below.
+        */
+       bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
+       if (!bd || IS_ERR(bd)) {
+               dprintk("%s failed to open device : %ld\n",
+                       __func__, PTR_ERR(bd));
+               goto out;
+       }
+
+       rv = kzalloc(sizeof(*rv), GFP_NOFS);
+       if (!rv) {
+               /* Nobody will own bd on this path; close it again */
+               nfs4_blkdev_put(bd);
+               rv = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       rv->bm_mdev = bd;
+       memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
+       dprintk("%s Created device %s with bd_block_size %u\n",
+               __func__,
+               bd->bd_disk->disk_name,
+               bd->bd_block_size);
+
+out:
+       kfree(msg.data);
+       return rv;
+}
+
+/* Look up the block_device constructed for a server device id by scanning
+ * the mount's device list under bm_lock.  Returns NULL if the id is unknown.
+ */
+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
+                                           struct nfs4_deviceid *id)
+{
+       struct block_mount_id *mid;
+       struct pnfs_block_dev *cur;
+       struct block_device *found = NULL;
+
+       dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
+       mid = BLK_ID(lo);
+
+       spin_lock(&mid->bm_lock);
+       list_for_each_entry(cur, &mid->bm_devlist, bm_node) {
+               if (memcmp(id->data, cur->bm_mdevid.data,
+                          NFS4_DEVICEID4_SIZE) != 0)
+                       continue;
+               found = cur->bm_mdev;
+               break;
+       }
+       spin_unlock(&mid->bm_lock);
+
+       dprintk("%s returning %p\n", __func__, found);
+       return found;
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec.
+ * nfs4_blk_process_layoutget() seeds start, inval and cowread with the
+ * layout range's offset in sectors; verify_extent() advances them as each
+ * extent is checked.
+ */
+struct layout_verification {
+       u32 mode;       /* R or RW */
+       u64 start;      /* Expected start of next non-COW extent */
+       u64 inval;      /* Start of INVAL coverage */
+       u64 cowread;    /* End of COW read coverage */
+};
+
+/* Check one decoded extent against the ordering rules of the pnfs-block
+ * draft, section 2.3.1, and advance the running state in @lv.
+ *
+ * Returns 0 if the extent is acceptable, -EIO otherwise.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+                        struct layout_verification *lv)
+{
+       if (lv->mode == IOMODE_READ) {
+               /* A read layout must not carry writable extents, and its
+                * extents must be contiguous.
+                */
+               if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+                   be->be_state == PNFS_BLOCK_INVALID_DATA ||
+                   be->be_f_offset != lv->start)
+                       return -EIO;
+               lv->start += be->be_length;
+               return 0;
+       }
+
+       /* lv->mode == IOMODE_RW */
+       switch (be->be_state) {
+       case PNFS_BLOCK_READWRITE_DATA:
+               if (be->be_f_offset != lv->start ||
+                   lv->cowread > lv->start)
+                       return -EIO;
+               lv->start += be->be_length;
+               lv->inval = lv->start;
+               return 0;
+
+       case PNFS_BLOCK_INVALID_DATA:
+               if (be->be_f_offset != lv->start)
+                       return -EIO;
+               lv->start += be->be_length;
+               return 0;
+
+       case PNFS_BLOCK_READ_DATA:
+               if (be->be_f_offset > lv->start ||
+                   be->be_f_offset < lv->inval ||
+                   be->be_f_offset < lv->cowread)
+                       return -EIO;
+               /* It looks like you might want to min this with lv->start,
+                * but you really don't.
+                */
+               lv->inval = lv->inval + be->be_length;
+               lv->cowread = be->be_f_offset + be->be_length;
+               return 0;
+
+       default:
+               return -EIO;
+       }
+}
+
+/* XDR decode pnfs_block_layout4 structure
+ *
+ * Decodes every extent in the LAYOUTGET reply @lgr, verifies that together
+ * they obey the ordering rules and exactly cover the returned range, and
+ * then merges them into the extent lists of the block layout embedding @lo.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EIO for a malformed
+ * or inconsistent reply, or the error from bl_add_merge_extent().
+ */
+int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+                          struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+       struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+       int i, status = -EIO;
+       uint32_t count;
+       struct pnfs_block_extent *be = NULL, *save;
+       struct xdr_stream stream;
+       struct xdr_buf buf;
+       struct page *scratch;
+       __be32 *p;
+       struct layout_verification lv = {
+               .mode = lgr->range.iomode,
+               .start = lgr->range.offset >> SECTOR_SHIFT,
+               .inval = lgr->range.offset >> SECTOR_SHIFT,
+               .cowread = lgr->range.offset >> SECTOR_SHIFT,
+       };
+       LIST_HEAD(extents);
+
+       dprintk("---> %s\n", __func__);
+
+       scratch = alloc_page(gfp_flags);
+       if (!scratch)
+               return -ENOMEM;
+
+       xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+       xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+       p = xdr_inline_decode(&stream, 4);
+       if (unlikely(!p))
+               goto out_err;
+
+       count = be32_to_cpup(p++);
+
+       dprintk("%s enter, number of extents %i\n", __func__, count);
+       /* count comes from the server.  Bound it by what the layout body
+        * can hold so that the size computation below cannot wrap around
+        * and let the loop read past the decoded buffer.
+        */
+       if (count > lgr->layoutp->len / (28 + NFS4_DEVICEID4_SIZE))
+               goto out_err;
+       p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
+       if (unlikely(!p))
+               goto out_err;
+
+       /* Decode individual extents, putting them in temporary
+        * staging area until whole layout is decoded to make error
+        * recovery easier.
+        */
+       for (i = 0; i < count; i++) {
+               be = bl_alloc_extent();
+               if (!be) {
+                       status = -ENOMEM;
+                       goto out_err;
+               }
+               memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
+               p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+               be->be_mdev = translate_devid(lo, &be->be_devid);
+               if (!be->be_mdev)
+                       goto out_err;
+
+               /* The next three values are read in as bytes,
+                * but stored as 512-byte sector lengths
+                */
+               if (decode_sector_number(&p, &be->be_f_offset) < 0)
+                       goto out_err;
+               if (decode_sector_number(&p, &be->be_length) < 0)
+                       goto out_err;
+               if (decode_sector_number(&p, &be->be_v_offset) < 0)
+                       goto out_err;
+               be->be_state = be32_to_cpup(p++);
+               if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+                       be->be_inval = &bl->bl_inval;
+               if (verify_extent(be, &lv)) {
+                       dprintk("%s verify failed\n", __func__);
+                       goto out_err;
+               }
+               list_add_tail(&be->be_node, &extents);
+       }
+       /* From here on every extent is on the staging list, so be is
+        * cleared before jumping to out_err to avoid putting it twice.
+        */
+       if (lgr->range.offset + lgr->range.length !=
+                       lv.start << SECTOR_SHIFT) {
+               dprintk("%s Final length mismatch\n", __func__);
+               be = NULL;
+               goto out_err;
+       }
+       if (lv.start < lv.cowread) {
+               dprintk("%s Final uncovered COW extent\n", __func__);
+               be = NULL;
+               goto out_err;
+       }
+       /* Extents decoded properly, now try to merge them in to
+        * existing layout extents.
+        */
+       spin_lock(&bl->bl_ext_lock);
+       list_for_each_entry_safe(be, save, &extents, be_node) {
+               list_del(&be->be_node);
+               status = bl_add_merge_extent(bl, be);
+               if (status) {
+                       spin_unlock(&bl->bl_ext_lock);
+                       /* This is a fairly catastrophic error, as the
+                        * entire layout extent lists are now corrupted.
+                        * We should have some way to distinguish this.
+                        */
+                       be = NULL;
+                       goto out_err;
+               }
+       }
+       spin_unlock(&bl->bl_ext_lock);
+       status = 0;
+ out:
+       __free_page(scratch);
+       dprintk("%s returns %i\n", __func__, status);
+       return status;
+
+ out_err:
+       /* Drop the extent being built (if any), then everything staged */
+       bl_put_extent(be);
+       while (!list_empty(&extents)) {
+               be = list_first_entry(&extents, struct pnfs_block_extent,
+                                     be_node);
+               list_del(&be->be_node);
+               bl_put_extent(be);
+       }
+       goto out;
+}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644 (file)
index 0000000..d055c75
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayoutdm.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2007 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Fred Isaman <iisaman@umich.edu>
+ *  Andy Adamson <andros@citi.umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
+#include <linux/sched.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/*
+ * Queue a BL_DEVICE_UMOUNT upcall for @dev on bl_device_pipe and sleep
+ * until woken through bl_wq.
+ *
+ * The message is a struct bl_msg_hdr immediately followed by a
+ * struct bl_dev_msg carrying the major/minor numbers of @dev.
+ * Allocation or queueing failures are silently dropped: the function
+ * returns void and simply skips the wait.
+ */
+static void dev_remove(dev_t dev)
+{
+       struct rpc_pipe_msg msg;
+       struct bl_dev_msg bl_umount_request;
+       struct bl_msg_hdr bl_msg = {
+               .type = BL_DEVICE_UMOUNT,
+               .totallen = sizeof(bl_umount_request),
+       };
+       uint8_t *dataptr;
+       DECLARE_WAITQUEUE(wq, current);
+
+       dprintk("Entering %s\n", __func__);
+
+       memset(&msg, 0, sizeof(msg));
+       /* The buffer has to hold the header plus the payload.  It used to
+        * be sized 1 + sizeof(bl_umount_request), which is too small
+        * whenever sizeof(bl_msg) > 1 and let the second memcpy() below
+        * write past the end of the allocation.
+        */
+       msg.len = sizeof(bl_msg) + bl_msg.totallen;
+       msg.data = kzalloc(msg.len, GFP_NOFS);
+       if (!msg.data)
+               goto out;
+
+       memset(&bl_umount_request, 0, sizeof(bl_umount_request));
+       bl_umount_request.major = MAJOR(dev);
+       bl_umount_request.minor = MINOR(dev);
+
+       /* Header first, payload directly behind it */
+       memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+       dataptr = (uint8_t *) msg.data;
+       memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
+
+       /* Get on the wait queue before queueing the upcall */
+       add_wait_queue(&bl_wq, &wq);
+       if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+               remove_wait_queue(&bl_wq, &wq);
+               goto out;
+       }
+
+       /* NOTE(review): the task state is only set after the upcall has
+        * been queued; confirm a wakeup cannot arrive in that window.
+        */
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule();
+       __set_current_state(TASK_RUNNING);
+       remove_wait_queue(&bl_wq, &wq);
+
+out:
+       kfree(msg.data);
+}
+
+/*
+ * Release meta device
+ *
+ * Puts the block device reference with nfs4_blkdev_put() and then issues
+ * the dev_remove() upcall for that device.  A failing put is logged but
+ * does not stop the removal.
+ */
+static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
+       /* Read the device number up front: bdev->bm_mdev should not be
+        * dereferenced any more once our reference to it has been put.
+        */
+       dev_t dev = bdev->bm_mdev->bd_dev;
+       int rv;
+
+       dprintk("%s Releasing\n", __func__);
+       rv = nfs4_blkdev_put(bdev->bm_mdev);
+       if (rv)
+               printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
+                               __func__, rv);
+
+       dev_remove(dev);
+}
+
+/*
+ * Free @bdev.  When a meta device is attached it is released first.
+ * A NULL @bdev is accepted and ignored.
+ */
+void bl_free_block_dev(struct pnfs_block_dev *bdev)
+{
+       if (bdev == NULL)
+               return;
+
+       if (bdev->bm_mdev != NULL) {
+               dprintk("%s Removing DM device: %d:%d\n",
+                       __func__,
+                       MAJOR(bdev->bm_mdev->bd_dev),
+                       MINOR(bdev->bm_mdev->bd_dev));
+               nfs4_blk_metadev_release(bdev);
+       }
+
+       kfree(bdev);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644 (file)
index 0000000..19fa7b0
--- /dev/null
@@ -0,0 +1,935 @@
+/*
+ *  linux/fs/nfs/blocklayout/extents.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include "blocklayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/* Bit numbers of the tags that _add_entry() sets in it_tags */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN     1
+#define EXTENT_IN_COMMIT   2
+/* Extra bit, numbered MY_MAX_TAGS, that _preload_range() passes as the
+ * tag of the entries it creates.
+ */
+#define INTERNAL_EXISTS    MY_MAX_TAGS
+/* Selects the bits below INTERNAL_EXISTS; _find_entry() applies it so the
+ * internal bit is not reported to its callers.
+ */
+#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
+
+/* Round s down to the nearest multiple of base. */
+static inline sector_t normalize(sector_t s, int base)
+{
+       sector_t quotient = s;
+       sector_t remainder;
+
+       /* do_div() divides its first argument in place and hands back the
+        * remainder, hence the scratch copy of s.
+        */
+       remainder = do_div(quotient, base);
+       return s - remainder;
+}
+
+/* Round s up to the nearest multiple of base. */
+static inline sector_t normalize_up(sector_t s, int base)
+{
+       sector_t bumped = s + base - 1;
+
+       return normalize(bumped, base);
+}
+
+/* Stub implementation using a plain list, until the wanted API is determined */
+
+/* Look up the entry recorded for sector s.
+ * Returns its tags with the internal bits masked off, or -ENOENT when the
+ * list holds no entry for s.
+ */
+static int32_t _find_entry(struct my_tree *tree, u64 s)
+{
+       struct pnfs_inval_tracking *cur;
+
+       dprintk("%s(%llu) enter\n", __func__, s);
+       list_for_each_entry_reverse(cur, &tree->mtt_stub, it_link) {
+               if (cur->it_sector == s)
+                       return cur->it_tags & INTERNAL_MASK;
+               /* Scanning from the tail: the first entry below s ends
+                * the search.
+                */
+               if (cur->it_sector < s)
+                       break;
+       }
+       return -ENOENT;
+}
+
+/* Returns 1 if the entry for the step containing sector s carries tag,
+ * and 0 otherwise (including when no such entry exists).
+ */
+static inline
+int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
+{
+       int32_t found;
+
+       dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+       found = _find_entry(tree, normalize(s, tree->mtt_step_size));
+       if (found < 0)
+               return 0;
+       return (found & (1 << tag)) ? 1 : 0;
+}
+
+/* Make sure sector s has an entry carrying tag.
+ * An existing entry simply gets the tag bit or-ed in.  Otherwise a new
+ * entry is linked in, using storage when the caller supplied it and
+ * kmalloc() when storage is NULL.
+ * Returns the number of entries added (0 or 1), or -ENOMEM.
+ */
+static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
+                     struct pnfs_inval_tracking *storage)
+{
+       struct pnfs_inval_tracking *pos;
+       struct pnfs_inval_tracking *entry;
+
+       dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
+       list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+               if (pos->it_sector > s)
+                       continue;
+               if (pos->it_sector == s) {
+                       /* Already present: just union the tag in */
+                       pos->it_tags |= (1 << tag);
+                       return 0;
+               }
+               break;
+       }
+
+       entry = storage;
+       if (!entry) {
+               entry = kmalloc(sizeof(*entry), GFP_NOFS);
+               if (!entry)
+                       return -ENOMEM;
+       }
+       entry->it_sector = s;
+       entry->it_tags = (1 << tag);
+       /* pos is either the closest entry below s or, when the scan ran off
+        * the front of the list, the list head itself.
+        */
+       list_add(&entry->it_link, &pos->it_link);
+       return 1;
+}
+
+/* XXXX Really want option to not create */
+/* Over range, unions tag with existing entries, else creates entry with tag.
+ * Visits every step from normalize(s) up to, but not including,
+ * s + length.
+ * Returns 0 on success, or -ENOMEM if an entry could not be allocated.
+ */
+static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
+{
+       u64 i;
+
+       dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
+       for (i = normalize(s, tree->mtt_step_size); i < s + length;
+            i += tree->mtt_step_size) {
+               /* _add_entry() returns 1 when it had to create the entry
+                * and 0 when one already existed; only a negative value
+                * is a failure.  Treating any non-zero return as an error
+                * reported -ENOMEM for a successful creation.
+                */
+               if (_add_entry(tree, i, tag, NULL) < 0)
+                       return -ENOMEM;
+       }
+       return 0;
+}
+
+/* Ensure that future operations on given range of tree will not malloc.
+ *
+ * Allocates one tracking entry per step up front, then adds an
+ * INTERNAL_EXISTS entry for every step that does not have one yet.
+ * Entries that turn out not to be needed are freed again.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
+{
+       struct pnfs_inval_tracking **storage;
+       u64 start, end, s;
+       int count, i;
+       int used = 0;
+       int status = -ENOMEM;
+
+       dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
+       start = normalize(offset, tree->mtt_step_size);
+       end = normalize_up(offset + length, tree->mtt_step_size);
+       count = (int)(end - start) / (int)tree->mtt_step_size;
+
+       /* Pre-malloc what memory we might need */
+       storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
+       if (storage == NULL)
+               return -ENOMEM;
+       i = 0;
+       while (i < count) {
+               storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
+                                    GFP_NOFS);
+               if (storage[i] == NULL)
+                       goto out_cleanup;
+               i++;
+       }
+
+       /* Now need lock - HOW??? */
+
+       s = start;
+       while (s < end) {
+               /* storage[used] is only consumed when an entry is created */
+               used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
+               s += tree->mtt_step_size;
+       }
+
+       /* Unlock - HOW??? */
+       status = 0;
+
+ out_cleanup:
+       /* Free whatever was allocated but not linked into the list; a NULL
+        * slot marks the point where allocation failed.
+        */
+       for (i = used; i < count && storage[i]; i++)
+               kfree(storage[i]);
+       kfree(storage);
+       return status;
+}
+
+/* Insert offset into array, an ascending list of sectors terminated by ~0,
+ * unless it is already present.  A NULL array is ignored.
+ * NOTE(review): assumes the caller sized array with room for one more
+ * element - confirm against the allocation in bl_mark_sectors_init().
+ */
+static void set_needs_init(sector_t *array, sector_t offset)
+{
+       sector_t *slot, *tail;
+
+       dprintk("%s enter\n", __func__);
+       if (array == NULL)
+               return;
+
+       /* Find the first element that is not below offset */
+       for (slot = array; *slot < offset; slot++)
+               ;
+       if (*slot == offset)
+               return;
+
+       if (*slot == ~0) {
+               /* Append: overwrite the terminator and write a new one */
+               slot[0] = offset;
+               slot[1] = ~0;
+               return;
+       }
+
+       dprintk("%s Adding %llu\n", __func__, (u64)offset);
+       /* Shift everything from slot through the terminator up by one */
+       for (tail = slot; *tail != ~0; tail++)
+               ;
+       tail++;
+       memmove(slot + 1, slot, (char *)tail - (char *)slot);
+       *slot = offset;
+}
+
+/* Returns 1 if isect is tagged EXTENT_INITIALIZED in marks, else 0.
+ * We are relying on page lock to serialize this.
+ */
+int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
+{
+       int initialized;
+
+       spin_lock(&marks->im_lock);
+       initialized = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
+       spin_unlock(&marks->im_lock);
+
+       return initialized;
+}
+
+/* Assume start, end already sector aligned.
+ *
+ * Returns 1 if every step in [start, end) has an entry carrying tag, and
+ * 0 otherwise.  The list is scanned from its tail: the first entry below
+ * end has to be the last step of the range, and each following entry has
+ * to sit exactly one step lower.
+ */
+static int
+_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
+{
+       struct pnfs_inval_tracking *pos;
+       u64 expect = 0;
+       /* Explicit flag rather than testing expect == 0: sector 0 is a
+        * legitimate expected value when start is 0, and treating it as
+        * "not started yet" made such ranges report 0.
+        */
+       int started = 0;
+
+       dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
+       list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+               if (pos->it_sector >= end)
+                       continue;
+               if (!started) {
+                       if (pos->it_sector != end - tree->mtt_step_size)
+                               return 0;
+                       started = 1;
+               } else if (pos->it_sector != expect) {
+                       return 0;
+               }
+               if (!(pos->it_tags & (1 << tag)))
+                       return 0;
+               /* Nothing can lie below this entry; stop here instead of
+                * letting the subtraction wrap around.
+                */
+               if (pos->it_sector < tree->mtt_step_size)
+                       return 1;
+               expect = pos->it_sector - tree->mtt_step_size;
+               if (expect < start)
+                       return 1;
+       }
+       return 0;
+}
+
+/* Returns 1 if all of [start, end) is tagged EXTENT_WRITTEN, else 0.
+ * Takes marks->im_lock around the lookup.
+ */
+static int is_range_written(struct pnfs_inval_markings *marks,
+                           sector_t start, sector_t end)
+{
+       int written;
+
+       spin_lock(&marks->im_lock);
+       written = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
+       spin_unlock(&marks->im_lock);
+
+       return written;
+}
+
+/* Marks sectors in [offset, offset+length) as having been initialized.
+ * All lengths are step-aligned, where step is min(pagesize, blocksize).
+ * Notes where partial block is initialized, and helps prepare it for
+ * complete initialization later.
+ *
+ * If pages is not NULL, *pages is set to a ~0-terminated array of the
+ * page-aligned sectors in the surrounding block(s) that are not yet tagged
+ * EXTENT_INITIALIZED, or to NULL when there are none or on error.
+ * NOTE(review): the array is kmalloc()ed here and not freed on success,
+ * so presumably the caller frees it - confirm.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+/* Currently assumes offset is page-aligned */
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+                            sector_t offset, sector_t length,
+                            sector_t **pages)
+{
+       sector_t s, start, end;
+       sector_t *array = NULL; /* Pages to mark */
+
+       dprintk("%s(offset=%llu,len=%llu) enter\n",
+               __func__, (u64)offset, (u64)length);
+       /* s is first used as the number of elements to allocate for array */
+       s = max((sector_t) 3,
+               2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
+       dprintk("%s set max=%llu\n", __func__, (u64)s);
+       if (pages) {
+               array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
+               if (!array)
+                       goto outerr;
+               /* Start out as an empty list: just the terminator */
+               array[0] = ~0;
+       }
+
+       /* Widen the range to whole blocks and make sure entries exist for
+        * it before im_lock is taken.
+        */
+       start = normalize(offset, marks->im_block_size);
+       end = normalize_up(offset + length, marks->im_block_size);
+       if (_preload_range(&marks->im_tree, start, end - start))
+               goto outerr;
+
+       spin_lock(&marks->im_lock);
+
+       /* Pages between the start of the block and offset */
+       for (s = normalize_up(start, PAGE_CACHE_SECTORS);
+            s < offset; s += PAGE_CACHE_SECTORS) {
+               dprintk("%s pre-area pages\n", __func__);
+               /* Portion of used block is not initialized */
+               if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+                       set_needs_init(array, s);
+       }
+       if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
+               goto out_unlock;
+       /* Pages between the end of the range and the end of the block */
+       for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
+            s < end; s += PAGE_CACHE_SECTORS) {
+               dprintk("%s post-area pages\n", __func__);
+               if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+                       set_needs_init(array, s);
+       }
+
+       spin_unlock(&marks->im_lock);
+
+       if (pages) {
+               /* Nothing was recorded: hand back NULL instead of an
+                * empty array.
+                */
+               if (array[0] == ~0) {
+                       kfree(array);
+                       *pages = NULL;
+               } else
+                       *pages = array;
+       }
+       return 0;
+
+ out_unlock:
+       spin_unlock(&marks->im_lock);
+ outerr:
+       if (pages) {
+               kfree(array);
+               *pages = NULL;
+       }
+       return -ENOMEM;
+}
+
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ * Returns the result of _set_range(), called with marks->im_lock held.
+ */
+static int mark_written_sectors(struct pnfs_inval_markings *marks,
+                               sector_t offset, sector_t length)
+{
+       int rc;
+
+       dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+               (u64)offset, (u64)length);
+
+       spin_lock(&marks->im_lock);
+       rc = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
+       spin_unlock(&marks->im_lock);
+
+       return rc;
+}
+
+/* Debug helper: dump the file offset and length of a short extent.
+ * A NULL pointer is printed and otherwise ignored.
+ */
+static void print_short_extent(struct pnfs_block_short_extent *be)
+{
+       dprintk("PRINT SHORT EXTENT extent %p\n", be);
+       if (!be)
+               return;
+
+       dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
+       dprintk("        be_length   %llu\n", (u64)be->bse_length);
+}
+
+/* Debug helper: dump every short extent on list, and complain when the
+ * number of entries found differs from count.
+ */
+static void print_clist(struct list_head *list, unsigned int count)
+{
+       struct pnfs_block_short_extent *cur;
+       unsigned int seen = 0;
+
+       ifdebug(FACILITY) {
+               printk(KERN_DEBUG "****************\n");
+               printk(KERN_DEBUG "Extent list looks like:\n");
+               list_for_each_entry(cur, list, bse_node) {
+                       print_short_extent(cur);
+                       seen++;
+               }
+               if (seen != count)
+                       printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
+               printk(KERN_DEBUG "****************\n");
+       }
+}
+
+/* Insert new into bl->bl_commit, merging it with entries it overlaps or
+ * abuts so the list stays ordered by file offset.  new is either linked
+ * into the list or freed here; bl->bl_count is kept in step with the
+ * number of list entries.  The visible caller, bl_mark_for_commit(),
+ * holds bl->bl_ext_lock around this call.
+ *
+ * Note: In theory, we should do more checking that devid's match between
+ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
+ */
+/* Note this is very similar to bl_add_merge_extent */
+static void add_to_commitlist(struct pnfs_block_layout *bl,
+                             struct pnfs_block_short_extent *new)
+{
+       struct list_head *clist = &bl->bl_commit;
+       struct pnfs_block_short_extent *old, *save;
+       /* End of new; only grows, during the forward scan below */
+       sector_t end = new->bse_f_offset + new->bse_length;
+
+       dprintk("%s enter\n", __func__);
+       print_short_extent(new);
+       print_clist(clist, bl->bl_count);
+       /* Count new up front; undone below if it turns out to be redundant */
+       bl->bl_count++;
+       /* Scan for proper place to insert, extending new to the left
+        * as much as possible.
+        */
+       list_for_each_entry_safe(old, save, clist, bse_node) {
+               if (new->bse_f_offset < old->bse_f_offset)
+                       break;
+               if (end <= old->bse_f_offset + old->bse_length) {
+                       /* Range is already in list */
+                       bl->bl_count--;
+                       kfree(new);
+                       return;
+               } else if (new->bse_f_offset <=
+                               old->bse_f_offset + old->bse_length) {
+                       /* new overlaps or abuts existing be */
+                       if (new->bse_mdev == old->bse_mdev) {
+                               /* extend new to fully replace old */
+                               new->bse_length += new->bse_f_offset -
+                                               old->bse_f_offset;
+                               new->bse_f_offset = old->bse_f_offset;
+                               list_del(&old->bse_node);
+                               bl->bl_count--;
+                               kfree(old);
+                       }
+               }
+       }
+       /* Note that if we never hit the above break, old will not point to a
+        * valid extent.  However, in that case &old->bse_node==list.
+        */
+       /* Link new in immediately before old */
+       list_add_tail(&new->bse_node, &old->bse_node);
+       /* Scan forward for overlaps.  If we find any, extend new and
+        * remove the overlapped extent.
+        */
+       old = list_prepare_entry(new, clist, bse_node);
+       list_for_each_entry_safe_continue(old, save, clist, bse_node) {
+               if (end < old->bse_f_offset)
+                       break;
+               /* new overlaps or abuts old */
+               if (new->bse_mdev == old->bse_mdev) {
+                       if (end < old->bse_f_offset + old->bse_length) {
+                               /* extend new to fully cover old */
+                               end = old->bse_f_offset + old->bse_length;
+                               new->bse_length = end - new->bse_f_offset;
+                       }
+                       list_del(&old->bse_node);
+                       bl->bl_count--;
+                       kfree(old);
+               }
+       }
+       dprintk("%s: after merging\n", __func__);
+       print_clist(clist, bl->bl_count);
+}
+
+/* Record that [offset, offset + length) of be has been written, and queue
+ * the block-aligned part of it that is completely written for commit.
+ *
+ * Note the range described by offset, length is guaranteed to be contained
+ * within be.
+ *
+ * Returns 0, or -ENOMEM if the commit list entry cannot be allocated.
+ */
+int bl_mark_for_commit(struct pnfs_block_extent *be,
+                   sector_t offset, sector_t length)
+{
+       struct pnfs_block_layout *bl = container_of(be->be_inval,
+                                                   struct pnfs_block_layout,
+                                                   bl_inval);
+       struct pnfs_block_short_extent *new;
+       sector_t end = offset + length;
+       sector_t blk_start, blk_end;
+
+       new = kmalloc(sizeof(*new), GFP_NOFS);
+       if (new == NULL)
+               return -ENOMEM;
+
+       mark_written_sectors(be->be_inval, offset, length);
+
+       /* The range put on the commit list must be block-normalized, and
+        * every block in it must have been written to disk in full.
+        */
+
+       /* Leading edge: take the whole first block if the part in front of
+        * offset is written too, otherwise start at the next block.
+        */
+       new->bse_f_offset = offset;
+       blk_start = normalize(offset, bl->bl_blocksize);
+       if (blk_start < new->bse_f_offset) {
+               if (!is_range_written(be->be_inval, blk_start,
+                                     new->bse_f_offset))
+                       blk_start += bl->bl_blocksize;
+               new->bse_f_offset = blk_start;
+       }
+
+       /* Trailing edge: same idea, mirrored */
+       blk_end = normalize_up(end, bl->bl_blocksize);
+       if (end < blk_end) {
+               if (is_range_written(be->be_inval, end, blk_end))
+                       end = blk_end;
+               else
+                       end = blk_end - bl->bl_blocksize;
+       }
+
+       /* No complete block left over: nothing to commit yet */
+       if (end <= new->bse_f_offset) {
+               kfree(new);
+               return 0;
+       }
+
+       new->bse_length = end - new->bse_f_offset;
+       new->bse_devid = be->be_devid;
+       new->bse_mdev = be->be_mdev;
+
+       /* From here on new is freed either by add_to_commitlist(), if it
+        * decides not to use it, or after LAYOUTCOMMIT uses it in the
+        * commitlist.
+        */
+       spin_lock(&bl->bl_ext_lock);
+       add_to_commitlist(bl, new);
+       spin_unlock(&bl->bl_ext_lock);
+
+       return 0;
+}
+
+/* Debug helper: dump the offsets, length and state of an extent.
+ * A NULL pointer is printed and otherwise ignored.
+ */
+static void print_bl_extent(struct pnfs_block_extent *be)
+{
+       dprintk("PRINT EXTENT extent %p\n", be);
+       if (!be)
+               return;
+
+       dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset);
+       dprintk("        be_length   %llu\n", (u64)be->be_length);
+       dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset);
+       dprintk("        be_state    %d\n", be->be_state);
+}
+
+/* kref release callback: frees the extent that embeds kref. */
+static void
+destroy_extent(struct kref *kref)
+{
+       struct pnfs_block_extent *be =
+               container_of(kref, struct pnfs_block_extent, be_refcnt);
+
+       dprintk("%s be=%p\n", __func__, be);
+       kfree(be);
+}
+
+/* Drop one reference on be; destroy_extent() runs when it was the last.
+ * A NULL be is accepted and ignored.
+ */
+void
+bl_put_extent(struct pnfs_block_extent *be)
+{
+       if (!be)
+               return;
+
+       dprintk("%s enter %p (%i)\n", __func__, be,
+               atomic_read(&be->be_refcnt.refcount));
+       kref_put(&be->be_refcnt, destroy_extent);
+}
+
+/* Allocate an extent with its list node and refcount initialized and
+ * be_inval set to NULL.  The remaining fields are left uninitialized.
+ * Returns NULL if the allocation fails.
+ */
+struct pnfs_block_extent *bl_alloc_extent(void)
+{
+       struct pnfs_block_extent *be = kmalloc(sizeof(*be), GFP_NOFS);
+
+       if (be != NULL) {
+               INIT_LIST_HEAD(&be->be_node);
+               kref_init(&be->be_refcnt);
+               be->be_inval = NULL;
+       }
+       return be;
+}
+
+/* Debug helper: dump every extent on list. */
+static void print_elist(struct list_head *list)
+{
+       struct pnfs_block_extent *cur;
+
+       dprintk("****************\n");
+       dprintk("Extent list looks like:\n");
+       list_for_each_entry(cur, list, be_node)
+               print_bl_extent(cur);
+       dprintk("****************\n");
+}
+
+/* Returns 1 if old and new agree where they overlap, 0 otherwise.
+ * They must be in the same state; unless that state is
+ * PNFS_BLOCK_NONE_DATA they must also share the same be_mdev and the same
+ * file-offset to volume-offset delta.
+ */
+static inline int
+extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
+{
+       /* Note this assumes new->be_f_offset >= old->be_f_offset */
+       if (new->be_state != old->be_state)
+               return 0;
+       if (new->be_state == PNFS_BLOCK_NONE_DATA)
+               return 1;
+       if (new->be_v_offset - old->be_v_offset !=
+           new->be_f_offset - old->be_f_offset)
+               return 0;
+       return new->be_mdev == old->be_mdev;
+}
+
+/* Adds new to appropriate list in bl, modifying new and removing existing
+ * extents as appropriate to deal with overlaps.
+ *
+ * See bl_find_get_extent for list constraints.
+ *
+ * Refcount on new is already set.  If end up not using it, or error out,
+ * need to put the reference.
+ *
+ * bl->bl_ext_lock is held by caller.
+ *
+ * Returns 0 when new was inserted or was already covered by an existing
+ * extent, and -EIO when new overlaps an extent it is not consistent with
+ * (see extents_consistent()).  In the -EIO case extents merged earlier in
+ * the scan have already been removed from the list.
+ */
+int
+bl_add_merge_extent(struct pnfs_block_layout *bl,
+                    struct pnfs_block_extent *new)
+{
+       struct pnfs_block_extent *be, *tmp;
+       /* End of new as passed in; not updated when new is extended below */
+       sector_t end = new->be_f_offset + new->be_length;
+       struct list_head *list;
+
+       dprintk("%s enter with be=%p\n", __func__, new);
+       print_bl_extent(new);
+       list = &bl->bl_extents[bl_choose_list(new->be_state)];
+       print_elist(list);
+
+       /* Scan for proper place to insert, extending new to the left
+        * as much as possible.
+        */
+       list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
+               /* be ends at or before the start of new: insert after it */
+               if (new->be_f_offset >= be->be_f_offset + be->be_length)
+                       break;
+               if (new->be_f_offset >= be->be_f_offset) {
+                       if (end <= be->be_f_offset + be->be_length) {
+                               /* new is a subset of existing be*/
+                               if (extents_consistent(be, new)) {
+                                       dprintk("%s: new is subset, ignoring\n",
+                                               __func__);
+                                       bl_put_extent(new);
+                                       return 0;
+                               } else {
+                                       goto out_err;
+                               }
+                       } else {
+                               /* |<--   be   -->|
+                                *          |<--   new   -->| */
+                               if (extents_consistent(be, new)) {
+                                       /* extend new to fully replace be */
+                                       new->be_length += new->be_f_offset -
+                                               be->be_f_offset;
+                                       new->be_f_offset = be->be_f_offset;
+                                       new->be_v_offset = be->be_v_offset;
+                                       dprintk("%s: removing %p\n", __func__, be);
+                                       list_del(&be->be_node);
+                                       bl_put_extent(be);
+                               } else {
+                                       goto out_err;
+                               }
+                       }
+               } else if (end >= be->be_f_offset + be->be_length) {
+                       /* new extent overlap existing be */
+                       /* new starts before be and ends at or after it, so
+                        * new needs no adjusting; be is simply dropped.
+                        */
+                       if (extents_consistent(be, new)) {
+                               /* extend new to fully replace be */
+                               dprintk("%s: removing %p\n", __func__, be);
+                               list_del(&be->be_node);
+                               bl_put_extent(be);
+                       } else {
+                               goto out_err;
+                       }
+               } else if (end > be->be_f_offset) {
+                       /*           |<--   be   -->|
+                        *|<--   new   -->| */
+                       /* Arguments swapped: here new is the one that
+                        * starts first.
+                        */
+                       if (extents_consistent(new, be)) {
+                               /* extend new to fully replace be */
+                               new->be_length += be->be_f_offset + be->be_length -
+                                       new->be_f_offset - new->be_length;
+                               dprintk("%s: removing %p\n", __func__, be);
+                               list_del(&be->be_node);
+                               bl_put_extent(be);
+                       } else {
+                               goto out_err;
+                       }
+               }
+               /* Otherwise be lies entirely after new: keep scanning */
+       }
+       /* Note that if we never hit the above break, be will not point to a
+        * valid extent.  However, in that case &be->be_node==list.
+        */
+       list_add(&new->be_node, &be->be_node);
+       dprintk("%s: inserting new\n", __func__);
+       print_elist(list);
+       /* FIXME - The per-list consistency checks have all been done,
+        * should now check cross-list consistency.
+        */
+       return 0;
+
+ out_err:
+       bl_put_extent(new);
+       return -EIO;
+}
+
+/* Returns the extent containing isect with a reference taken, or NULL.
+ * If a second READ extent exists, it is returned in cow_read, if given
+ * (also with a reference taken); otherwise *cow_read is set to NULL.
+ *
+ * The extents are kept in two separate ordered lists, one for READ and NONE,
+ * one for READWRITE and INVALID.  Within each list, we assume:
+ * 1. Extents are ordered by file offset.
+ * 2. For any given isect, there is at most one extent that matches.
+ */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+           struct pnfs_block_extent **cow_read)
+{
+       struct pnfs_block_extent *be;
+       struct pnfs_block_extent *ret = NULL;
+       struct pnfs_block_extent *cow = NULL;
+       int i;
+
+       dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+       spin_lock(&bl->bl_ext_lock);
+       for (i = 0; i < EXTENT_LISTS; i++) {
+               list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+                       /* Everything further down the scan ends too early */
+                       if (isect >= be->be_f_offset + be->be_length)
+                               break;
+                       if (isect < be->be_f_offset)
+                               continue;
+
+                       /* We have found an extent */
+                       dprintk("%s Get %p (%i)\n", __func__, be,
+                               atomic_read(&be->be_refcnt.refcount));
+                       kref_get(&be->be_refcnt);
+                       if (ret == NULL)
+                               ret = be;
+                       else if (be->be_state == PNFS_BLOCK_READ_DATA)
+                               cow = be;
+                       else
+                               bl_put_extent(be);
+                       break;
+               }
+               /* Only go on to the next list when the caller wants a
+                * cow extent and the one found so far is INVALID.
+                */
+               if (ret &&
+                   (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
+                       break;
+       }
+       spin_unlock(&bl->bl_ext_lock);
+
+       if (cow_read)
+               *cow_read = cow;
+       print_bl_extent(ret);
+       return ret;
+}
+
+/* Similar to bl_find_get_extent, but called with lock held, and ignores cow.
+ * Returns the first extent found that contains isect, with a reference
+ * taken, or NULL.
+ */
+static struct pnfs_block_extent *
+bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
+{
+       struct pnfs_block_extent *be;
+       struct pnfs_block_extent *ret = NULL;
+       int i;
+
+       dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+       for (i = 0; i < EXTENT_LISTS && !ret; i++) {
+               list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+                       if (isect >= be->be_f_offset + be->be_length)
+                               break;
+                       if (isect < be->be_f_offset)
+                               continue;
+
+                       /* We have found an extent */
+                       dprintk("%s Get %p (%i)\n", __func__, be,
+                               atomic_read(&be->be_refcnt.refcount));
+                       kref_get(&be->be_refcnt);
+                       ret = be;
+                       break;
+               }
+       }
+       print_bl_extent(ret);
+       return ret;
+}
+
+/* XDR-encode the ranges on bl->bl_commit into xdr and move each encoded
+ * entry over to bl->bl_committing.
+ *
+ * Eight bytes are reserved up front and filled in at the end with the
+ * encoded byte length and the number of ranges.  Encoding stops early when
+ * xdr runs out of space; entries not encoded stay on bl->bl_commit.
+ * Always returns 0.  arg is not used.
+ */
+int
+encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                              struct xdr_stream *xdr,
+                              const struct nfs4_layoutcommit_args *arg)
+{
+       struct pnfs_block_short_extent *lce, *save;
+       __be32 *p, *xdr_start;
+       unsigned int count = 0;
+
+       dprintk("%s enter\n", __func__);
+       /* BUG - creation of bl_commit is buggy - need to wait for
+        * entire block to be marked WRITTEN before it can be added.
+        */
+       spin_lock(&bl->bl_ext_lock);
+       /* Want to adjust for possible truncate */
+       /* We now want to adjust argument range */
+
+       /* XDR encode the ranges found */
+       xdr_start = xdr_reserve_space(xdr, 8);
+       if (xdr_start != NULL) {
+               list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
+                       p = xdr_reserve_space(xdr,
+                                       7 * 4 + sizeof(lce->bse_devid.data));
+                       if (p == NULL)
+                               break;
+                       p = xdr_encode_opaque_fixed(p, lce->bse_devid.data,
+                                                   NFS4_DEVICEID4_SIZE);
+                       p = xdr_encode_hyper(p,
+                                       lce->bse_f_offset << SECTOR_SHIFT);
+                       p = xdr_encode_hyper(p,
+                                       lce->bse_length << SECTOR_SHIFT);
+                       p = xdr_encode_hyper(p, 0LL);
+                       *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+
+                       /* Encoded: hand the entry over to bl_committing */
+                       list_del(&lce->bse_node);
+                       list_add_tail(&lce->bse_node, &bl->bl_committing);
+                       bl->bl_count--;
+                       count++;
+               }
+               /* Back-fill the two words reserved at the start */
+               xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
+               xdr_start[1] = cpu_to_be32(count);
+       }
+       spin_unlock(&bl->bl_ext_lock);
+       dprintk("%s found %i ranges\n", __func__, count);
+       return 0;
+}
+
+/* Helper for set_to_rw: fill in the extent "new" so that it starts at file
+ * offset "offset", spans "length" and carries "state".  The device id,
+ * be_mdev and be_inval are copied from orig, and be_v_offset is moved by
+ * the same distance as be_f_offset.  The refcount is set up by kref_init.
+ */
+static void
+_prep_new_extent(struct pnfs_block_extent *new,
+                struct pnfs_block_extent *orig,
+                sector_t offset, sector_t length, int state)
+{
+       /* Fields taken over unchanged from the original extent */
+       memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
+       new->be_mdev = orig->be_mdev;
+       new->be_inval = orig->be_inval;
+
+       /* Fields describing the new range */
+       new->be_state = state;
+       new->be_length = length;
+       new->be_f_offset = offset;
+       new->be_v_offset = orig->be_v_offset + (offset - orig->be_f_offset);
+
+       /* don't need to INIT_LIST_HEAD(&new->be_node) */
+       kref_init(&new->be_refcnt);
+}
+
+/* Try to merge be with the extent directly in front of it on the list
+ * headed by head.  The merge happens only when storage is non-NULL, be has
+ * a predecessor, the predecessor ends exactly where be starts, and
+ * extents_consistent() accepts the pair.  On a merge, storage replaces the
+ * predecessor and describes the combined range, both old extents are put,
+ * and storage is returned.  Otherwise storage is freed and be is returned.
+ */
+static struct pnfs_block_extent *
+_front_merge(struct pnfs_block_extent *be, struct list_head *head,
+            struct pnfs_block_extent *storage)
+{
+       struct pnfs_block_extent *before = NULL;
+       int can_merge = 0;
+
+       if (storage && &be->be_node != head && be->be_node.prev != head) {
+               before = list_entry(be->be_node.prev,
+                                   struct pnfs_block_extent, be_node);
+               can_merge = before->be_f_offset + before->be_length ==
+                               be->be_f_offset &&
+                           extents_consistent(before, be);
+       }
+       if (!can_merge) {
+               kfree(storage);
+               return be;
+       }
+
+       /* Build the combined extent in storage and swap it in for before */
+       _prep_new_extent(storage, before, before->be_f_offset,
+                        before->be_length + be->be_length, before->be_state);
+       list_replace(&before->be_node, &storage->be_node);
+       bl_put_extent(before);
+       /* storage now covers be as well: unlink be and put it */
+       list_del(&be->be_node);
+       bl_put_extent(be);
+       return storage;
+}
+
+/* Convert the part of [offset, offset + length) that lies inside the extent
+ * containing offset from PNFS_BLOCK_INVALID_DATA to
+ * PNFS_BLOCK_READWRITE_DATA.
+ *
+ * The containing extent is replaced by up to three new extents: an
+ * INVALID_DATA head (e1), the READWRITE_DATA middle (e2) and an
+ * INVALID_DATA tail (e3).  When no head or tail is needed, the spare
+ * allocation is handed to _front_merge so the new extents can be joined
+ * with their neighbours.
+ *
+ * Returns the end of the extent that contained offset, so the caller can
+ * continue from there.  If the allocations fail or no extent contains
+ * offset, nothing is changed and offset + length is returned.  If the
+ * extent is not INVALID_DATA it is left alone and its end is returned.
+ */
+static u64
+set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
+{
+       u64 rv = offset + length;
+       struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
+       struct pnfs_block_extent *children[3];
+       struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
+       int i = 0, j;
+
+       dprintk("%s(%llu, %llu)\n", __func__, offset, length);
+       /* Create storage for up to three new extents e1, e2, e3 */
+       e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
+       e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
+       e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
+       /* BUG - we are ignoring any failure */
+       if (!e1 || !e2 || !e3)
+               goto out_nosplit;
+
+       spin_lock(&bl->bl_ext_lock);
+       be = bl_find_get_extent_locked(bl, offset);
+       if (!be) {
+               /* No extent contains offset, so there is nothing to
+                * convert.  rv still holds offset + length.
+                */
+               spin_unlock(&bl->bl_ext_lock);
+               goto out_nosplit;
+       }
+       rv = be->be_f_offset + be->be_length;
+       if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
+               spin_unlock(&bl->bl_ext_lock);
+               /* Drop the reference taken by bl_find_get_extent_locked */
+               bl_put_extent(be);
+               goto out_nosplit;
+       }
+       /* Add e* to children, bumping e*'s krefs */
+       if (be->be_f_offset != offset) {
+               _prep_new_extent(e1, be, be->be_f_offset,
+                                offset - be->be_f_offset,
+                                PNFS_BLOCK_INVALID_DATA);
+               children[i++] = e1;
+               print_bl_extent(e1);
+       } else
+               merge1 = e1;
+       _prep_new_extent(e2, be, offset,
+                        min(length, be->be_f_offset + be->be_length - offset),
+                        PNFS_BLOCK_READWRITE_DATA);
+       children[i++] = e2;
+       print_bl_extent(e2);
+       if (offset + length < be->be_f_offset + be->be_length) {
+               _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
+                                be->be_f_offset + be->be_length -
+                                offset - length,
+                                PNFS_BLOCK_INVALID_DATA);
+               children[i++] = e3;
+               print_bl_extent(e3);
+       } else
+               merge2 = e3;
+
+       /* Remove be from list, and insert the e* */
+       /* We don't get refs on e*, since this list is the base reference
+        * set when init'ed.
+        */
+       if (i < 3)
+               children[i] = NULL;
+       new = children[0];
+       list_replace(&be->be_node, &new->be_node);
+       bl_put_extent(be);
+       new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
+       for (j = 1; j < i; j++) {
+               old = new;
+               new = children[j];
+               list_add(&new->be_node, &old->be_node);
+       }
+       if (merge2) {
+               /* This is a HACK, should just create a _back_merge function */
+               new = list_entry(new->be_node.next,
+                                struct pnfs_block_extent, be_node);
+               new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
+       }
+       spin_unlock(&bl->bl_ext_lock);
+
+       /* Since we removed the base reference above, be is now scheduled for
+        * destruction.
+        */
+       bl_put_extent(be);
+       dprintk("%s returns %llu after split\n", __func__, rv);
+       return rv;
+
+ out_nosplit:
+       kfree(e1);
+       kfree(e2);
+       kfree(e3);
+       dprintk("%s returns %llu without splitting\n", __func__, rv);
+       return rv;
+}
+
+/* Finish a layoutcommit for every short extent on bl->bl_committing.
+ * With status == 0 the range of each short extent is run through
+ * set_to_rw and the short extent is freed.  Otherwise the short extent is
+ * unlinked and handed to add_to_commitlist under bl_ext_lock.
+ */
+void
+clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                             const struct nfs4_layoutcommit_args *arg,
+                             int status)
+{
+       struct pnfs_block_short_extent *bse, *next;
+       u64 pos, end;
+
+       dprintk("%s status %d\n", __func__, status);
+       list_for_each_entry_safe(bse, next, &bl->bl_committing, bse_node) {
+               if (unlikely(status)) {
+                       /* Commit failed: hand the extent back */
+                       list_del(&bse->bse_node);
+                       spin_lock(&bl->bl_ext_lock);
+                       add_to_commitlist(bl, bse);
+                       spin_unlock(&bl->bl_ext_lock);
+                       continue;
+               }
+
+               pos = bse->bse_f_offset;
+               end = pos + bse->bse_length;
+               /* set_to_rw returns where to continue from; repeat until
+                * the whole range has been visited.
+                */
+               for (;;) {
+                       pos = set_to_rw(bl, pos, end - pos);
+                       if (pos >= end)
+                               break;
+               }
+               list_del(&bse->bse_node);
+
+               kfree(bse);
+       }
+}
index 19ea7d9..5833fbb 100644 (file)
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
        .nrvers                 = ARRAY_SIZE(nfs_version),
        .version                = nfs_version,
        .stats                  = &nfs_rpcstat,
-       .pipe_dir_name          = "/nfs",
+       .pipe_dir_name          = NFS_PIPE_DIRNAME,
 };
 
 struct rpc_stat nfs_rpcstat = {
@@ -904,7 +904,9 @@ error:
 /*
  * Load up the server record from information gained in an fsinfo record
  */
-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+                                 struct nfs_fh *mntfh,
+                                 struct nfs_fsinfo *fsinfo)
 {
        unsigned long max_rpc_payload;
 
@@ -934,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
                server->wsize = NFS_MAX_FILE_IO_SIZE;
        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       set_pnfs_layoutdriver(server, fsinfo->layouttype);
+       server->pnfs_blksize = fsinfo->blksize;
+       set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
 
        server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
@@ -980,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
        if (error < 0)
                goto out_error;
 
-       nfs_server_set_fsinfo(server, &fsinfo);
+       nfs_server_set_fsinfo(server, mntfh, &fsinfo);
 
        /* Get some general file system info */
        if (server->namelen == 0) {
index 57f578e..b238d95 100644 (file)
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {
 
 #endif /* CONFIG_NFS_V4 */
 
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
 {
        struct nfs_open_dir_context *ctx;
        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (ctx != NULL) {
                ctx->duped = 0;
+               ctx->attr_gencount = NFS_I(dir)->attr_gencount;
                ctx->dir_cookie = 0;
                ctx->dup_cookie = 0;
                ctx->cred = get_rpccred(cred);
-       } else
-               ctx = ERR_PTR(-ENOMEM);
-       return ctx;
+               return ctx;
+       }
+       return  ERR_PTR(-ENOMEM);
 }
 
 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
        cred = rpc_lookup_cred();
        if (IS_ERR(cred))
                return PTR_ERR(cred);
-       ctx = alloc_nfs_open_dir_context(cred);
+       ctx = alloc_nfs_open_dir_context(inode, cred);
        if (IS_ERR(ctx)) {
                res = PTR_ERR(ctx);
                goto out;
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 {
        loff_t diff = desc->file->f_pos - desc->current_index;
        unsigned int index;
-       struct nfs_open_dir_context *ctx = desc->file->private_data;
 
        if (diff < 0)
                goto out_eof;
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
        index = (unsigned int)diff;
        *desc->dir_cookie = array->array[index].cookie;
        desc->cache_entry_index = index;
-       ctx->duped = 0;
        return 0;
 out_eof:
        desc->eof = 1;
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
        int i;
        loff_t new_pos;
        int status = -EAGAIN;
-       struct nfs_open_dir_context *ctx = desc->file->private_data;
 
        for (i = 0; i < array->size; i++) {
                if (array->array[i].cookie == *desc->dir_cookie) {
+                       struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+                       struct nfs_open_dir_context *ctx = desc->file->private_data;
+
                        new_pos = desc->current_index + i;
-                       if (new_pos < desc->file->f_pos) {
+                       if (ctx->attr_gencount != nfsi->attr_gencount
+                           || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+                               ctx->duped = 0;
+                               ctx->attr_gencount = nfsi->attr_gencount;
+                       } else if (new_pos < desc->file->f_pos) {
+                               if (ctx->duped > 0
+                                   && ctx->dup_cookie == *desc->dir_cookie) {
+                                       if (printk_ratelimit()) {
+                                               pr_notice("NFS: directory %s/%s contains a readdir loop."
+                                                               "Please contact your server vendor.  "
+                                                               "The file: %s has duplicate cookie %llu\n",
+                                                               desc->file->f_dentry->d_parent->d_name.name,
+                                                               desc->file->f_dentry->d_name.name,
+                                                               array->array[i].string.name,
+                                                               *desc->dir_cookie);
+                                       }
+                                       status = -ELOOP;
+                                       goto out;
+                               }
                                ctx->dup_cookie = *desc->dir_cookie;
-                               ctx->duped = 1;
+                               ctx->duped = -1;
                        }
                        desc->file->f_pos = new_pos;
                        desc->cache_entry_index = i;
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
                if (*desc->dir_cookie == array->last_cookie)
                        desc->eof = 1;
        }
+out:
        return status;
 }
 
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
        struct nfs_cache_array *array = NULL;
        struct nfs_open_dir_context *ctx = file->private_data;
 
-       if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
-               if (printk_ratelimit()) {
-                       pr_notice("NFS: directory %s/%s contains a readdir loop.  "
-                               "Please contact your server vendor.  "
-                               "Offending cookie: %llu\n",
-                               file->f_dentry->d_parent->d_name.name,
-                               file->f_dentry->d_name.name,
-                               *desc->dir_cookie);
-               }
-               res = -ELOOP;
-               goto out;
-       }
-
        array = nfs_readdir_get_array(desc->page);
        if (IS_ERR(array)) {
                res = PTR_ERR(array);
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
                        *desc->dir_cookie = array->array[i+1].cookie;
                else
                        *desc->dir_cookie = array->last_cookie;
+               if (ctx->duped != 0)
+                       ctx->duped = 1;
        }
        if (array->eof_index >= 0)
                desc->eof = 1;
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
        struct page     *page = NULL;
        int             status;
        struct inode *inode = desc->file->f_path.dentry->d_inode;
+       struct nfs_open_dir_context *ctx = desc->file->private_data;
 
        dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
                        (unsigned long long)*desc->dir_cookie);
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
        desc->page_index = 0;
        desc->last_cookie = *desc->dir_cookie;
        desc->page = page;
+       ctx->duped = 0;
 
        status = nfs_readdir_xdr_to_array(desc, page, inode);
        if (status < 0)
index 1909ee8..1ec1a85 100644 (file)
@@ -318,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
-extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[3];
 extern const u32 nfs4_fs_locations_bitmap[2];
 
 /* nfs4renewd.c */
index be93a62..e8915d4 100644 (file)
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 
        pnfs_set_layoutcommit(wdata);
        dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
-               (unsigned long) wdata->lseg->pls_end_pos);
+               (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
 }
 
 /*
index 079614d..8c77039 100644 (file)
@@ -140,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
        0
 };
 
-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD0_MAXREAD
                        | FATTR4_WORD0_MAXWRITE
                        | FATTR4_WORD0_LEASE_TIME,
                        FATTR4_WORD1_TIME_DELTA
-                       | FATTR4_WORD1_FS_LAYOUT_TYPES
+                       | FATTR4_WORD1_FS_LAYOUT_TYPES,
+                       FATTR4_WORD2_LAYOUT_BLKSIZE
 };
 
 const u32 nfs4_fs_locations_bitmap[2] = {
@@ -5834,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
        return status;
 }
 
+/*
+ * Retrieve the list of Data Server devices from the MDS.
+ *
+ * Performs one synchronous GETDEVICELIST call for file handle fh, asking
+ * for the layout class of the server's current layout driver.  devlist is
+ * passed along as the result buffer.  Returns the status of the call.
+ */
+static int _nfs4_getdevicelist(struct nfs_server *server,
+                                   const struct nfs_fh *fh,
+                                   struct pnfs_devicelist *devlist)
+{
+       struct nfs4_getdevicelist_args gdl_args = {
+               .fh = fh,
+               .layoutclass = server->pnfs_curr_ld->id,
+       };
+       struct nfs4_getdevicelist_res gdl_res = {
+               .devlist = devlist,
+       };
+       struct rpc_message rpc = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
+               .rpc_argp = &gdl_args,
+               .rpc_resp = &gdl_res,
+       };
+       int rc;
+
+       dprintk("--> %s\n", __func__);
+       rc = nfs4_call_sync(server->client, server, &rpc,
+                           &gdl_args.seq_args, &gdl_res.seq_res, 0);
+       dprintk("<-- %s status=%d\n", __func__, rc);
+       return rc;
+}
+
+/*
+ * GETDEVICELIST with NFSv4 exception handling: the call is repeated for as
+ * long as nfs4_handle_exception() sets exception.retry, and the last value
+ * returned by nfs4_handle_exception() is the result.
+ */
+int nfs4_proc_getdevicelist(struct nfs_server *server,
+                           const struct nfs_fh *fh,
+                           struct pnfs_devicelist *devlist)
+{
+       struct nfs4_exception exception = { };
+       int status, err;
+
+       for (;;) {
+               status = _nfs4_getdevicelist(server, fh, devlist);
+               err = nfs4_handle_exception(server, status, &exception);
+               if (!exception.retry)
+                       break;
+       }
+
+       dprintk("%s: err=%d, num_devs=%u\n", __func__,
+               err, devlist->num_devs);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
+
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 {
@@ -5912,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 static void nfs4_layoutcommit_release(void *calldata)
 {
        struct nfs4_layoutcommit_data *data = calldata;
+       struct pnfs_layout_segment *lseg, *tmp;
 
+       pnfs_cleanup_layoutcommit(data);
        /* Matched by references in pnfs_set_layoutcommit */
-       put_lseg(data->lseg);
+       list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
+               list_del_init(&lseg->pls_lc_list);
+               if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
+                                      &lseg->pls_flags))
+                       put_lseg(lseg);
+       }
        put_rpccred(data->cred);
        kfree(data);
 }
index c191a9b..1dce12f 100644 (file)
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
 #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)
 #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)
 #define encode_fsinfo_maxsz    (encode_getattr_maxsz)
-#define decode_fsinfo_maxsz    (op_decode_hdr_maxsz + 15)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz    (op_decode_hdr_maxsz + \
+                                nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
 #define encode_renew_maxsz     (op_encode_hdr_maxsz + 3)
 #define decode_renew_maxsz     (op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);
                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz  (op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz  (op_decode_hdr_maxsz + 4)
+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
+                               encode_verifier_maxsz)
+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
+                               2 /* nfs_cookie4 gdlr_cookie */ + \
+                               decode_verifier_maxsz \
+                                 /* verifier4 gdlr_verifier */ + \
+                               1 /* gdlr_deviceid_list count */ + \
+                               XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
+                                           NFS4_DEVICEID4_SIZE) \
+                                 /* gdlr_deviceid_list */ + \
+                               1 /* bool gdlr_eof */)
 #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
                                XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -748,6 +763,14 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz   (compound_decode_hdr_maxsz + \
                                         decode_sequence_maxsz + \
                                         decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
+                               encode_sequence_maxsz + \
+                               encode_putfh_maxsz + \
+                               encode_getdevicelist_maxsz)
+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
+                               decode_sequence_maxsz + \
+                               decode_putfh_maxsz + \
+                               decode_getdevicelist_maxsz)
 #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
                                encode_sequence_maxsz +\
                                encode_getdeviceinfo_maxsz)
@@ -1104,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
        hdr->replen += decode_getattr_maxsz;
 }
 
+/*
+ * Encode a GETATTR operation with an attribute bitmap of up to three
+ * words.  Trailing zero words are left out: three words are sent when bm2
+ * is non-zero, two when bm2 is zero but bm1 is not, and one otherwise.
+ * Bumps hdr->nops and adds decode_getattr_maxsz to hdr->replen.
+ */
+static void
+encode_getattr_three(struct xdr_stream *xdr,
+                    uint32_t bm0, uint32_t bm1, uint32_t bm2,
+                    struct compound_hdr *hdr)
+{
+       const uint32_t words[3] = { bm0, bm1, bm2 };
+       uint32_t nwords = bm2 ? 3 : (bm1 ? 2 : 1);
+       uint32_t i;
+       __be32 *p;
+
+       p = reserve_space(xdr, 4);
+       *p = cpu_to_be32(OP_GETATTR);
+
+       /* Bitmap length, then the bitmap words themselves */
+       p = reserve_space(xdr, 4 + 4 * nwords);
+       *p++ = cpu_to_be32(nwords);
+       for (i = 0; i < nwords; i++)
+               *p++ = cpu_to_be32(words[i]);
+
+       hdr->nops++;
+       hdr->replen += decode_getattr_maxsz;
+}
+
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
        encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1112,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
 
 static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
-       encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
-                          bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
+       encode_getattr_three(xdr,
+                            bitmask[0] & nfs4_fsinfo_bitmap[0],
+                            bitmask[1] & nfs4_fsinfo_bitmap[1],
+                            bitmask[2] & nfs4_fsinfo_bitmap[2],
+                            hdr);
 }
 
 static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1854,6 +1909,26 @@ static void encode_sequence(struct xdr_stream *xdr,
 }
 
 #ifdef CONFIG_NFS_V4_1
+/*
+ * Encode a GETDEVICELIST operation: the opcode, the layout class from
+ * args, the maximum count NFS4_PNFS_GETDEVLIST_MAXNUM, a zero cookie and a
+ * fixed placeholder verifier.  Bumps hdr->nops and adds
+ * decode_getdevicelist_maxsz to hdr->replen.
+ */
+static void
+encode_getdevicelist(struct xdr_stream *xdr,
+                    const struct nfs4_getdevicelist_args *args,
+                    struct compound_hdr *hdr)
+{
+       nfs4_verifier placeholder = {
+               .data = "dummmmmy",
+       };
+       /* Three 4-byte words plus the 8-byte cookie */
+       __be32 *p = reserve_space(xdr, 20);
+
+       *p++ = cpu_to_be32(OP_GETDEVICELIST);
+       *p++ = cpu_to_be32(args->layoutclass);
+       *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
+       xdr_encode_hyper(p, 0ULL);                          /* cookie */
+       encode_nfs4_verifier(xdr, &placeholder);
+
+       hdr->replen += decode_getdevicelist_maxsz;
+       hdr->nops++;
+}
+
 static void
 encode_getdeviceinfo(struct xdr_stream *xdr,
                     const struct nfs4_getdeviceinfo_args *args,
@@ -1916,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
        *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
        /* Only whole file layouts */
        p = xdr_encode_hyper(p, 0); /* offset */
-       p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
+       p = xdr_encode_hyper(p, args->lastbytewritten + 1);     /* length */
        *p++ = cpu_to_be32(0); /* reclaim */
        p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
        *p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -2604,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-       const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+       const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
 
        encode_compound_hdr(xdr, req, &hdr);
        encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2748,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
        };
-       const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+       const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
 
        encode_compound_hdr(xdr, req, &hdr);
        encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2774,6 +2849,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
        encode_nops(&hdr);
 }
 
+/*
+ * Encode GETDEVICELIST request
+ *
+ * The compound consists of SEQUENCE, PUTFH on args->fh and GETDEVICELIST,
+ * in that order.
+ */
+static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      struct nfs4_getdevicelist_args *args)
+{
+       struct compound_hdr chdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &chdr);
+
+       /* Operations, in the order they appear in the compound */
+       encode_sequence(xdr, &args->seq_args, &chdr);
+       encode_putfh(xdr, args->fh, &chdr);
+       encode_getdevicelist(xdr, args, &chdr);
+
+       encode_nops(&chdr);
+}
+
 /*
  * Encode GETDEVICEINFO request
  */
@@ -3011,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
                goto out_overflow;
        bmlen = be32_to_cpup(p);
 
-       bitmap[0] = bitmap[1] = 0;
+       bitmap[0] = bitmap[1] = bitmap[2] = 0;
        p = xdr_inline_decode(xdr, (bmlen << 2));
        if (unlikely(!p))
                goto out_overflow;
        if (bmlen > 0) {
                bitmap[0] = be32_to_cpup(p++);
-               if (bmlen > 1)
-                       bitmap[1] = be32_to_cpup(p);
+               if (bmlen > 1) {
+                       bitmap[1] = be32_to_cpup(p++);
+                       if (bmlen > 2)
+                               bitmap[2] = be32_to_cpup(p);
+               }
        }
        return 0;
 out_overflow:
@@ -3050,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
                        return ret;
                bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
        } else
-               bitmask[0] = bitmask[1] = 0;
-       dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
+               bitmask[0] = bitmask[1] = bitmask[2] = 0;
+       dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+               bitmask[0], bitmask[1], bitmask[2]);
        return 0;
 }
 
@@ -4105,7 +4202,7 @@ out_overflow:
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
 {
        __be32 *savep;
-       uint32_t attrlen, bitmap[2] = {0};
+       uint32_t attrlen, bitmap[3] = {0};
        int status;
 
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4131,7 +4228,7 @@ xdr_error:
 static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
 {
        __be32 *savep;
-       uint32_t attrlen, bitmap[2] = {0};
+       uint32_t attrlen, bitmap[3] = {0};
        int status;
 
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4163,7 +4260,7 @@ xdr_error:
 static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
 {
        __be32 *savep;
-       uint32_t attrlen, bitmap[2] = {0};
+       uint32_t attrlen, bitmap[3] = {0};
        int status;
 
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4303,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
 {
        __be32 *savep;
        uint32_t attrlen,
-                bitmap[2] = {0};
+                bitmap[3] = {0};
        int status;
 
        status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4389,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
        return status;
 }
 
+/*
+ * The preferred block size for layout directed io
+ *
+ * *res is set to 0 first.  When FATTR4_WORD2_LAYOUT_BLKSIZE is set in
+ * bitmap[2], one 32-bit word is decoded into *res and the bit is cleared.
+ * Returns 0, or -EIO if the word cannot be read from the stream.
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+                                     uint32_t *res)
+{
+       __be32 *p;
+
+       dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+       *res = 0;
+
+       /* Attribute not present in the reply: leave *res at 0 */
+       if (!(bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE))
+               return 0;
+
+       p = xdr_inline_decode(xdr, 4);
+       if (unlikely(!p)) {
+               print_overflow_msg(__func__, xdr);
+               return -EIO;
+       }
+       *res = be32_to_cpup(p);
+       bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+       return 0;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
        __be32 *savep;
-       uint32_t attrlen, bitmap[2];
+       uint32_t attrlen, bitmap[3];
        int status;
 
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4420,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
        if (status != 0)
                goto xdr_error;
+       status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+       if (status)
+               goto xdr_error;
 
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
@@ -4839,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 {
        __be32 *savep;
        uint32_t attrlen,
-                bitmap[2] = {0};
+                bitmap[3] = {0};
        struct kvec *iov = req->rq_rcv_buf.head;
        int status;
 
@@ -5268,6 +5390,53 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode a GETDEVICELIST reply into @res. TODO: handle EOF != true.
+ */
+static int decode_getdevicelist(struct xdr_stream *xdr,
+                               struct pnfs_devicelist *res)
+{
+       __be32 *p;
+       int status, i;
+       struct nfs_writeverf verftemp;
+
+       status = decode_op_hdr(xdr, OP_GETDEVICELIST);
+       if (status)
+               return status;
+
+       p = xdr_inline_decode(xdr, 8 + 8 + 4);
+       if (unlikely(!p))
+               goto out_overflow;
+
+       /* TODO: Skip cookie for now */
+       p += 2;
+
+       /* Read verifier (decoded into a local and not used further) */
+       p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
+
+       res->num_devs = be32_to_cpup(p);
+
+       dprintk("%s: num_dev %u\n", __func__, res->num_devs);
+
+       if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
+               printk(KERN_ERR "%s too many result dev_num %u\n",
+                               __func__, res->num_devs);
+               return -EIO;
+       }
+
+       p = xdr_inline_decode(xdr,
+                             res->num_devs * NFS4_DEVICEID4_SIZE + 4);
+       if (unlikely(!p))
+               goto out_overflow;
+       for (i = 0; i < res->num_devs; i++)
+               p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
+                                           NFS4_DEVICEID4_SIZE);
+       res->eof = be32_to_cpup(p);
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
 
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
                                struct pnfs_device *pdev)
@@ -5430,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
        int status;
 
        status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+       res->status = status;
        if (status)
                return status;
 
@@ -6541,6 +6711,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
        return status;
 }
 
+/*
+ * Decode GETDEVICELIST response
+ */
+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_getdevicelist_res *res)
+{
+       struct compound_hdr hdr;
+       int status;
+
+       dprintk("decoding getdevicelist!\n");
+
+       status = decode_compound_hdr(xdr, &hdr);
+       if (status != 0)
+               goto out;
+       status = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (status != 0)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status != 0)
+               goto out;
+       status = decode_getdevicelist(xdr, res->devlist);
+out:
+       return status;
+}
+
 /*
  * Decode GETDEVINFO response
  */
@@ -6722,7 +6918,7 @@ out:
 int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
                       int plus)
 {
-       uint32_t bitmap[2] = {0};
+       uint32_t bitmap[3] = {0};
        uint32_t len;
        __be32 *p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
@@ -6908,6 +7104,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(SECINFO_NO_NAME,   enc_secinfo_no_name,    dec_secinfo_no_name),
        PROC(TEST_STATEID,      enc_test_stateid,       dec_test_stateid),
        PROC(FREE_STATEID,      enc_free_stateid,       dec_free_stateid),
+       PROC(GETDEVICELIST,     enc_getdevicelist,      dec_getdevicelist),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
index 38e5508..e550e88 100644 (file)
@@ -76,8 +76,11 @@ find_pnfs_driver(u32 id)
 void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
-       if (nfss->pnfs_curr_ld)
+       if (nfss->pnfs_curr_ld) {
+               if (nfss->pnfs_curr_ld->clear_layoutdriver)
+                       nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                module_put(nfss->pnfs_curr_ld->owner);
+       }
        nfss->pnfs_curr_ld = NULL;
 }
 
@@ -88,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
  * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
  */
 void
-set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+                     u32 id)
 {
        struct pnfs_layoutdriver_type *ld_type = NULL;
 
@@ -115,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
                goto out_no_driver;
        }
        server->pnfs_curr_ld = ld_type;
+       if (ld_type->set_layoutdriver
+           && ld_type->set_layoutdriver(server, mntfh)) {
+               printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
+                               __func__, id);
+               module_put(ld_type->owner);
+               goto out_no_driver;
+       }
 
        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;
@@ -190,6 +201,7 @@ static void
 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+       put_rpccred(lo->plh_lc_cred);
        return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
 }
 
@@ -224,6 +236,7 @@ static void
 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 {
        INIT_LIST_HEAD(&lseg->pls_list);
+       INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -816,7 +829,9 @@ out:
 }
 
 static struct pnfs_layout_hdr *
-alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+alloc_init_layout_hdr(struct inode *ino,
+                     struct nfs_open_context *ctx,
+                     gfp_t gfp_flags)
 {
        struct pnfs_layout_hdr *lo;
 
@@ -828,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_recall);
        lo->plh_inode = ino;
+       lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
        return lo;
 }
 
 static struct pnfs_layout_hdr *
-pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
+pnfs_find_alloc_layout(struct inode *ino,
+                      struct nfs_open_context *ctx,
+                      gfp_t gfp_flags)
 {
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;
@@ -847,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
                        return nfsi->layout;
        }
        spin_unlock(&ino->i_lock);
-       new = alloc_init_layout_hdr(ino, gfp_flags);
+       new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);
 
        if (likely(nfsi->layout == NULL))       /* Won the race? */
@@ -940,7 +958,7 @@ pnfs_update_layout(struct inode *ino,
        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
                return NULL;
        spin_lock(&ino->i_lock);
-       lo = pnfs_find_alloc_layout(ino, gfp_flags);
+       lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
                goto out_unlock;
@@ -1350,16 +1368,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
 
 /*
- * Currently there is only one (whole file) write lseg.
+ * There can be multiple RW segments.
  */
-static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
+static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 {
-       struct pnfs_layout_segment *lseg, *rv = NULL;
+       struct pnfs_layout_segment *lseg;
 
-       list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
-               if (lseg->pls_range.iomode == IOMODE_RW)
-                       rv = lseg;
-       return rv;
+       list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+               if (lseg->pls_range.iomode == IOMODE_RW &&
+                   test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+                       list_add(&lseg->pls_lc_list, listp);
+       }
 }
 
 void
@@ -1371,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 
        spin_lock(&nfsi->vfs_inode.i_lock);
        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
-               /* references matched in nfs4_layoutcommit_release */
-               get_lseg(wdata->lseg);
-               wdata->lseg->pls_lc_cred =
-                       get_rpccred(wdata->args.context->state->owner->so_cred);
                mark_as_dirty = true;
                dprintk("%s: Set layoutcommit for inode %lu ",
                        __func__, wdata->inode->i_ino);
        }
-       if (end_pos > wdata->lseg->pls_end_pos)
-               wdata->lseg->pls_end_pos = end_pos;
+       if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
+               /* references matched in nfs4_layoutcommit_release */
+               get_lseg(wdata->lseg);
+       }
+       if (end_pos > nfsi->layout->plh_lwb)
+               nfsi->layout->plh_lwb = end_pos;
        spin_unlock(&nfsi->vfs_inode.i_lock);
+       dprintk("%s: lseg %p end_pos %llu\n",
+               __func__, wdata->lseg, nfsi->layout->plh_lwb);
 
        /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
         * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1390,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
+{
+       struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+       if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+               nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
+}
+
 /*
  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
@@ -1403,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
        struct nfs4_layoutcommit_data *data;
        struct nfs_inode *nfsi = NFS_I(inode);
-       struct pnfs_layout_segment *lseg;
-       struct rpc_cred *cred;
        loff_t end_pos;
        int status = 0;
 
@@ -1421,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
                goto out;
        }
 
+       INIT_LIST_HEAD(&data->lseg_list);
        spin_lock(&inode->i_lock);
        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
                spin_unlock(&inode->i_lock);
                kfree(data);
                goto out;
        }
-       /*
-        * Currently only one (whole file) write lseg which is referenced
-        * in pnfs_set_layoutcommit and will be found.
-        */
-       lseg = pnfs_list_write_lseg(inode);
 
-       end_pos = lseg->pls_end_pos;
-       cred = lseg->pls_lc_cred;
-       lseg->pls_end_pos = 0;
-       lseg->pls_lc_cred = NULL;
+       pnfs_list_write_lseg(inode, &data->lseg_list);
+
+       end_pos = nfsi->layout->plh_lwb;
+       nfsi->layout->plh_lwb = 0;
 
        memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
                sizeof(nfsi->layout->plh_stateid.data));
        spin_unlock(&inode->i_lock);
 
        data->args.inode = inode;
-       data->lseg = lseg;
-       data->cred = cred;
+       data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
        nfs_fattr_init(&data->fattr);
        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
        data->res.fattr = &data->fattr;
index 078670d..e0b5d80 100644 (file)
 enum {
        NFS_LSEG_VALID = 0,     /* cleared when lseg is recalled/returned */
        NFS_LSEG_ROC,           /* roc bit received from server */
+       NFS_LSEG_LAYOUTCOMMIT,  /* layoutcommit bit set for layoutcommit */
 };
 
 struct pnfs_layout_segment {
        struct list_head pls_list;
+       struct list_head pls_lc_list;
        struct pnfs_layout_range pls_range;
        atomic_t pls_refcount;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
-       struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
-       loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
 };
 
 enum pnfs_try_status {
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
        struct module *owner;
        unsigned flags;
 
+       int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+       int (*clear_layoutdriver) (struct nfs_server *);
+
        struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
        void (*free_layout_hdr) (struct pnfs_layout_hdr *);
 
@@ -110,6 +113,8 @@ struct pnfs_layoutdriver_type {
                                     struct xdr_stream *xdr,
                                     const struct nfs4_layoutreturn_args *args);
 
+       void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
+
        void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
                                     struct xdr_stream *xdr,
                                     const struct nfs4_layoutcommit_args *args);
@@ -125,6 +130,8 @@ struct pnfs_layout_hdr {
        unsigned long           plh_block_lgets; /* block LAYOUTGET if >0 */
        u32                     plh_barrier; /* ignore lower seqids */
        unsigned long           plh_flags;
+       loff_t                  plh_lwb; /* last write byte for layoutcommit */
+       struct rpc_cred         *plh_lc_cred; /* layoutcommit cred */
        struct inode            *plh_inode;
 };
 
@@ -137,10 +144,21 @@ struct pnfs_device {
        unsigned int  pglen;
 };
 
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
+struct pnfs_devicelist {
+       unsigned int            eof;
+       unsigned int            num_devs;
+       struct nfs4_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
+};
+
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
 /* nfs4proc.c */
+extern int nfs4_proc_getdevicelist(struct nfs_server *server,
+                                  const struct nfs_fh *fh,
+                                  struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev);
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
@@ -153,7 +171,7 @@ void put_lseg(struct pnfs_layout_segment *lseg);
 bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
 bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
 
-void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -179,6 +197,7 @@ void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_ld_write_done(struct nfs_write_data *);
@@ -360,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
        return false;
 }
 
-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+static inline void set_pnfs_layoutdriver(struct nfs_server *s,
+                                        const struct nfs_fh *mntfh, u32 id)
 {
 }
 
index f387919..8c6ee44 100644 (file)
@@ -29,6 +29,8 @@
 #define NFS_MNT_VERSION                1
 #define NFS_MNT3_VERSION       3
 
+#define NFS_PIPE_DIRNAME "/nfs"
+
 /*
  * NFS stats. The good thing with these values is that NFSv3 errors are
  * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
index a3c4bc8..76f99e8 100644 (file)
@@ -566,6 +566,7 @@ enum {
        NFSPROC4_CLNT_SECINFO_NO_NAME,
        NFSPROC4_CLNT_TEST_STATEID,
        NFSPROC4_CLNT_FREE_STATEID,
+       NFSPROC4_CLNT_GETDEVICELIST,
 };
 
 /* nfs41 types */
index 8b579be..b96fb99 100644 (file)
@@ -99,9 +99,10 @@ struct nfs_open_context {
 
 struct nfs_open_dir_context {
        struct rpc_cred *cred;
+       unsigned long attr_gencount;
        __u64 dir_cookie;
        __u64 dup_cookie;
-       int duped;
+       signed char duped;
 };
 
 /*
index 50a661f..82fdfc7 100644 (file)
@@ -132,7 +132,7 @@ struct nfs_server {
 #endif
 
 #ifdef CONFIG_NFS_V4
-       u32                     attr_bitmask[2];/* V4 bitmask representing the set
+       u32                     attr_bitmask[3];/* V4 bitmask representing the set
                                                   of attributes supported on this
                                                   filesystem */
        u32                     cache_consistency_bitmask[2];
@@ -145,6 +145,8 @@ struct nfs_server {
                                                   filesystem */
        struct pnfs_layoutdriver_type  *pnfs_curr_ld; /* Active layout driver */
        struct rpc_wait_queue   roc_rpcwaitq;
+       u32                     pnfs_blksize;   /* layout_blksize attr */
+       void                    *pnfs_ld_data;  /* per mount point data */
 
        /* the following fields are protected by nfs_client->cl_lock */
        struct rb_root          state_owners;
index 5b11595..569ea5b 100644 (file)
@@ -122,6 +122,7 @@ struct nfs_fsinfo {
        struct timespec         time_delta; /* server time granularity */
        __u32                   lease_time; /* in seconds */
        __u32                   layouttype; /* supported pnfs layout driver */
+       __u32                   blksize; /* preferred pnfs io block size */
 };
 
 struct nfs_fsstat {
@@ -235,6 +236,17 @@ struct nfs4_layoutget {
        gfp_t gfp_flags;
 };
 
+struct nfs4_getdevicelist_args {
+       const struct nfs_fh *fh;
+       u32 layoutclass;
+       struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_getdevicelist_res {
+       struct pnfs_devicelist *devlist;
+       struct nfs4_sequence_res seq_res;
+};
+
 struct nfs4_getdeviceinfo_args {
        struct pnfs_device *pdev;
        struct nfs4_sequence_args seq_args;
@@ -257,12 +269,13 @@ struct nfs4_layoutcommit_res {
        struct nfs_fattr *fattr;
        const struct nfs_server *server;
        struct nfs4_sequence_res seq_res;
+       int status;
 };
 
 struct nfs4_layoutcommit_data {
        struct rpc_task task;
        struct nfs_fattr fattr;
-       struct pnfs_layout_segment *lseg;
+       struct list_head lseg_list;
        struct rpc_cred *cred;
        struct nfs4_layoutcommit_args args;
        struct nfs4_layoutcommit_res res;
@@ -943,7 +956,7 @@ struct nfs4_server_caps_arg {
 };
 
 struct nfs4_server_caps_res {
-       u32                             attr_bitmask[2];
+       u32                             attr_bitmask[3];
        u32                             acl_bitmask;
        u32                             has_links;
        u32                             has_symlinks;