Merge branch 'pnfs-submit' of git://git.open-osd.org/linux-open-osd
authorLinus Torvalds <torvalds@linux-foundation.org>
Sun, 29 May 2011 21:10:13 +0000 (14:10 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sun, 29 May 2011 21:10:13 +0000 (14:10 -0700)
* 'pnfs-submit' of git://git.open-osd.org/linux-open-osd: (32 commits)
  pnfs-obj: pg_test check for max_io_size
  NFSv4.1: define nfs_generic_pg_test
  NFSv4.1: use pnfs_generic_pg_test directly by layout driver
  NFSv4.1: change pg_test return type to bool
  NFSv4.1: unify pnfs_pageio_init functions
  pnfs-obj: objlayout_encode_layoutcommit implementation
  pnfs: encode_layoutcommit
  pnfs-obj: report errors and .encode_layoutreturn Implementation.
  pnfs: encode_layoutreturn
  pnfs: layoutret_on_setattr
  pnfs: layoutreturn
  pnfs-obj: osd raid engine read/write implementation
  pnfs: support for non-rpc layout drivers
  pnfs-obj: define per-inode private structure
  pnfs: alloc and free layout_hdr layoutdriver methods
  pnfs-obj: objio_osd device information retrieval and caching
  pnfs-obj: decode layout, alloc/free lseg
  pnfs-obj: pnfs_osd XDR client implementation
  pnfs-obj: pnfs_osd XDR definitions
  pnfs-obj: objlayoutdriver module skeleton
  ...

32 files changed:
fs/nfs/Kconfig
fs/nfs/Makefile
fs/nfs/callback.h
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs4filelayout.c
fs/nfs/nfs4filelayout.h
fs/nfs/nfs4filelayoutdev.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4xdr.c
fs/nfs/objlayout/Kbuild [new file with mode: 0644]
fs/nfs/objlayout/objio_osd.c [new file with mode: 0644]
fs/nfs/objlayout/objlayout.c [new file with mode: 0644]
fs/nfs/objlayout/objlayout.h [new file with mode: 0644]
fs/nfs/objlayout/pnfs_osd_xdr_cli.c [new file with mode: 0644]
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_dev.c [new file with mode: 0644]
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/write.c
include/linux/nfs4.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/pnfs_osd_xdr.h [new file with mode: 0644]
include/linux/sunrpc/xdr.h
net/sunrpc/xdr.c

index ba30665..8151554 100644 (file)
@@ -87,6 +87,16 @@ config NFS_V4_1
 config PNFS_FILE_LAYOUT
        tristate
 
+config PNFS_OBJLAYOUT
+       tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+       depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+       help
+         Say M here if you want your pNFS client to support the Objects Layout Driver.
+         Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+         upper level driver (SCSI_OSD_ULD).
+
+         If unsure, say N.
+
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
index 4776ff9..6a34f7d 100644 (file)
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           delegation.o idmap.o \
                           callback.o callback_xdr.o callback_proc.o \
                           nfs4namespace.o
-nfs-$(CONFIG_NFS_V4_1) += pnfs.o
+nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
 
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
index 46d93ce..b257383 100644 (file)
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
 
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
 extern void nfs4_cb_take_slot(struct nfs_client *clp);
+
+struct cb_devicenotifyitem {           /* one CB_NOTIFY_DEVICEID entry */
+       uint32_t                cbd_notify_type;  /* CHANGE or DELETE */
+       uint32_t                cbd_layout_type;  /* vs. pnfs_curr_ld->id */
+       struct nfs4_deviceid    cbd_dev_id;       /* device id to delete */
+       uint32_t                cbd_immediate;    /* extra word, else 0 */
+};
+
+struct cb_devicenotifyargs {           /* decoded CB_NOTIFY_DEVICEID args */
+       int                              ndevs;  /* entries decoded in devs */
+       struct cb_devicenotifyitem       *devs;  /* kfree()d by the process op */
+};
+
+extern __be32 nfs4_callback_devicenotify(
+       struct cb_devicenotifyargs *args,
+       void *dummy, struct cb_process_state *cps);
+
 #endif /* CONFIG_NFS_V4_1 */
 extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
index 2f41dcc..d4d1954 100644 (file)
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        spin_lock(&ino->i_lock);
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
            mark_matching_lsegs_invalid(lo, &free_me_list,
-                                       args->cbl_range.iomode))
+                                       &args->cbl_range))
                rv = NFS4ERR_DELAY;
        else
                rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
                ino = lo->plh_inode;
                spin_lock(&ino->i_lock);
                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-               if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+               if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
                        rv = NFS4ERR_DELAY;
                list_del_init(&lo->plh_bulk_recall);
                spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
        do_callback_layoutrecall(clp, &args);
 }
 
+/* CB_NOTIFY_DEVICEID: nfs4_delete_deviceid() each notified device id on the
+ * layout driver matching its layout type; args->devs is freed on every path. */
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+                                 void *dummy, struct cb_process_state *cps)
+{
+       int i;
+       __be32 res = 0;
+       struct nfs_client *clp = cps->clp;
+       struct nfs_server *server = NULL;
+
+       dprintk("%s: -->\n", __func__);
+       if (!clp) {
+               res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+               goto out;
+       }
+
+       for (i = 0; i < args->ndevs; i++) {
+               struct cb_devicenotifyitem *dev = &args->devs[i];
+
+               if (!server ||
+                   server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+                       rcu_read_lock();
+                       list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+                               if (server->pnfs_curr_ld &&
+                                   server->pnfs_curr_ld->id == dev->cbd_layout_type) {
+                                       rcu_read_unlock();
+                                       goto found;
+                               }
+                       rcu_read_unlock();
+                       server = NULL;  /* cursor ran off the list: not an entry */
+                       dprintk("%s: layout type %u not found\n",
+                               __func__, dev->cbd_layout_type);
+                       continue;
+               }
+       found:
+               if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+                       dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+                               "deleting instead\n", __func__);
+               nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+       }
+
+out:
+       kfree(args->devs);
+       dprintk("%s: exit with status = %u\n", __func__, be32_to_cpu(res));
+       return res;
+}
+
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
 {
        if (delegation == NULL)
index 00ecf62..c6c86a7 100644 (file)
@@ -25,6 +25,7 @@
 
 #if defined(CONFIG_NFS_V4_1)
 #define CB_OP_LAYOUTRECALL_RES_MAXSZ   (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ   (CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ       (CB_OP_HDR_RES_MAXSZ + \
                                        4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ      (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
        return status;
 }
 
+/* Decode CB_NOTIFY_DEVICEID arguments into a kcalloc()ed args->devs array.
+ * On any error the array is freed and args->devs/ndevs are reset to NULL/0. */
+static __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+                                      struct xdr_stream *xdr,
+                                      struct cb_devicenotifyargs *args)
+{
+       __be32 *p;
+       __be32 status = 0;
+       u32 tmp;
+       int n, i;
+       args->ndevs = 0;
+       args->devs = NULL;      /* so a later kfree() is safe when n <= 0 */
+
+       /* Num of device notifications */
+       p = read_buf(xdr, sizeof(uint32_t));
+       if (unlikely(p == NULL)) {
+               status = htonl(NFS4ERR_BADXDR);
+               goto out;
+       }
+       n = ntohl(*p++);
+       if (n <= 0)
+               goto out;
+
+       /* n is wire data: kcalloc() fails rather than overflow n * size */
+       args->devs = kcalloc(n, sizeof(*args->devs), GFP_KERNEL);
+       if (!args->devs) {
+               status = htonl(NFS4ERR_DELAY);
+               goto out;
+       }
+       /* Decode each dev notification */
+       for (i = 0; i < n; i++) {
+               struct cb_devicenotifyitem *dev = &args->devs[i];
+
+               p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+               if (unlikely(p == NULL)) {
+                       status = htonl(NFS4ERR_BADXDR);
+                       goto err;
+               }
+               tmp = ntohl(*p++);      /* bitmap size */
+               if (tmp != 1) {
+                       status = htonl(NFS4ERR_INVAL);
+                       goto err;
+               }
+               dev->cbd_notify_type = ntohl(*p++);
+               if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+                   dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+                       status = htonl(NFS4ERR_INVAL);
+                       goto err;
+               }
+
+               tmp = ntohl(*p++);      /* opaque size */
+               if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+                    (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+                   ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+                    (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+                       status = htonl(NFS4ERR_INVAL);
+                       goto err;
+               }
+               dev->cbd_layout_type = ntohl(*p++);
+               memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+               p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+               /* only a CHANGE notification carries the "immediate" word */
+               dev->cbd_immediate = 0;
+               if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+                       p = read_buf(xdr, sizeof(uint32_t));
+                       if (unlikely(p == NULL)) {
+                               status = htonl(NFS4ERR_BADXDR);
+                               goto err;
+                       }
+                       dev->cbd_immediate = ntohl(*p++);
+               }
+               args->ndevs++;
+               dprintk("%s: type %d layout 0x%x immediate %d\n",
+                       __func__, dev->cbd_notify_type, dev->cbd_layout_type,
+                       dev->cbd_immediate);
+       }
+out:
+       dprintk("%s: status %d ndevs %d\n",
+               __func__, ntohl(status), args->ndevs);
+       return status;
+err:
+       kfree(args->devs);
+       args->devs = NULL;
+       args->ndevs = 0;
+       goto out;
+}
+
 static __be32 decode_sessionid(struct xdr_stream *xdr,
                                 struct nfs4_sessionid *sid)
 {
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        case OP_CB_RECALL_ANY:
        case OP_CB_RECALL_SLOT:
        case OP_CB_LAYOUTRECALL:
+       case OP_CB_NOTIFY_DEVICEID:
                *op = &callback_ops[op_nr];
                break;
 
-       case OP_CB_NOTIFY_DEVICEID:
        case OP_CB_NOTIFY:
        case OP_CB_PUSH_DELEG:
        case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
                        (callback_decode_arg_t)decode_layoutrecall_args,
                .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
        },
+       [OP_CB_NOTIFY_DEVICEID] = {
+               .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+               .decode_args =
+                       (callback_decode_arg_t)decode_devicenotify_args,
+               .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+       },
        [OP_CB_SEQUENCE] = {
                .process_op = (callback_process_op_t)nfs4_callback_sequence,
                .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
index 139be96..b3dc2b8 100644 (file)
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
        if (clp->cl_machine_cred != NULL)
                put_rpccred(clp->cl_machine_cred);
 
+       nfs4_deviceid_purge_client(clp);
+
        kfree(clp->cl_hostname);
        kfree(clp);
 
index 424e477..ededdbd 100644 (file)
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
                                struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
        struct xdr_stream stream;
-       struct xdr_buf buf = {
-               .pages = xdr_pages,
-               .page_len = buflen,
-               .buflen = buflen,
-               .len = buflen,
-       };
+       struct xdr_buf buf;
        struct page *scratch;
        struct nfs_cache_array *array;
        unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
        if (scratch == NULL)
                return -ENOMEM;
 
-       xdr_init_decode(&stream, &buf, NULL);
+       xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
        do {
index 873c6fa..144f2a3 100644 (file)
@@ -1428,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 void nfs4_evict_inode(struct inode *inode)
 {
-       pnfs_destroy_layout(NFS_I(inode));
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
+       pnfs_return_layout(inode);
+       pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
index 2df6ca7..b9056cb 100644 (file)
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
 #endif
 
 /* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
 extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
 extern int nfs4_init_client(struct nfs_client *clp,
                            const struct rpc_timeout *timeparms,
index be79dc9..4269088 100644 (file)
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
                        struct nfs4_deviceid *id,
                        gfp_t gfp_flags)
 {
+       struct nfs4_deviceid_node *d;
        struct nfs4_file_layout_dsaddr *dsaddr;
        int status = -EINVAL;
        struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        dprintk("--> %s\n", __func__);
 
        if (fl->pattern_offset > lgr->range.offset) {
-               dprintk("%s pattern_offset %lld to large\n",
+               dprintk("%s pattern_offset %lld too large\n",
                                __func__, fl->pattern_offset);
                goto out;
        }
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        }
 
        /* find and reference the deviceid */
-       dsaddr = nfs4_fl_find_get_deviceid(id);
-       if (dsaddr == NULL) {
+       d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
+                                  NFS_SERVER(lo->plh_inode)->nfs_client, id);
+       if (d == NULL) {
                dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
                if (dsaddr == NULL)
                        goto out;
-       }
+       } else
+               dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
        fl->dsaddr = dsaddr;
 
        if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
                         gfp_t gfp_flags)
 {
        struct xdr_stream stream;
-       struct xdr_buf buf = {
-               .pages =  lgr->layoutp->pages,
-               .page_len =  lgr->layoutp->len,
-               .buflen =  lgr->layoutp->len,
-               .len = lgr->layoutp->len,
-       };
+       struct xdr_buf buf;
        struct page *scratch;
        __be32 *p;
        uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
        if (!scratch)
                return -ENOMEM;
 
-       xdr_init_decode(&stream, &buf, NULL);
+       xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
        /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
 
        memcpy(id, p, sizeof(*id));
        p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-       print_deviceid(id);
+       nfs4_print_deviceid(id);
 
        nfl_util = be32_to_cpup(p++);
        if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 /*
  * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
  *
- * return 1 :  coalesce page
- * return 0 :  don't coalesce page
+ * return true  : coalesce page
+ * return false : don't coalesce page
  */
-int
+bool
 filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
                   struct nfs_page *req)
 {
        u64 p_stripe, r_stripe;
        u32 stripe_unit;
 
+       if (!pnfs_generic_pg_test(pgio, prev, req))
+               return 0;
+
        if (!pgio->pg_lseg)
                return 1;
        p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
        return -ENOMEM;
 }
 
+static void
+filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+       nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
        .id                     = LAYOUT_NFSV4_1_FILES,
        .name                   = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .commit_pagelist        = filelayout_commit_pagelist,
        .read_pagelist          = filelayout_read_pagelist,
        .write_pagelist         = filelayout_write_pagelist,
+       .free_deviceid_node     = filelayout_free_deveiceid_node,
 };
 
 static int __init nfs4filelayout_init(void)
index 2b461d7..cebe01e 100644 (file)
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
 #define NFS4_DEVICE_ID_NEG_ENTRY       0x00000001
 
 struct nfs4_file_layout_dsaddr {
-       struct hlist_node               node;
-       struct nfs4_deviceid            deviceid;
-       atomic_t                        ref;
+       struct nfs4_deviceid_node       id_node;
        unsigned long                   flags;
        u32                             stripe_count;
        u8                              *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
 extern void print_ds(struct nfs4_pnfs_ds *ds);
-extern void print_deviceid(struct nfs4_deviceid *dev_id);
 u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
 u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
                                        u32 ds_idx);
-extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
 
index db07c7a..3b7bf13 100644 (file)
 
 #define NFSDBG_FACILITY                NFSDBG_PNFS_LD
 
-/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_FL_DEVICE_ID_HASH_BITS    5
-#define NFS4_FL_DEVICE_ID_HASH_SIZE    (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
-#define NFS4_FL_DEVICE_ID_HASH_MASK    (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
-{
-       unsigned char *cptr = (unsigned char *)id->data;
-       unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-       u32 x = 0;
-
-       while (nbytes--) {
-               x *= 37;
-               x += *cptr++;
-       }
-       return x & NFS4_FL_DEVICE_ID_HASH_MASK;
-}
-
-static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
-static DEFINE_SPINLOCK(filelayout_deviceid_lock);
-
 /*
  * Data server cache
  *
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
                ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
 }
 
-void
-print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
-{
-       int i;
-
-       ifdebug(FACILITY) {
-               printk("%s dsaddr->ds_num %d\n", __func__,
-                      dsaddr->ds_num);
-               for (i = 0; i < dsaddr->ds_num; i++)
-                       print_ds(dsaddr->ds_list[i]);
-       }
-}
-
-void print_deviceid(struct nfs4_deviceid *id)
-{
-       u32 *p = (u32 *)id;
-
-       dprintk("%s: device id= [%x%x%x%x]\n", __func__,
-               p[0], p[1], p[2], p[3]);
-}
-
 /* nfs4_ds_cache_lock is held */
 static struct nfs4_pnfs_ds *
 _data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
        kfree(ds);
 }
 
-static void
+void
 nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
        struct nfs4_pnfs_ds *ds;
        int i;
 
-       print_deviceid(&dsaddr->deviceid);
+       nfs4_print_deviceid(&dsaddr->id_node.deviceid);
 
        for (i = 0; i < dsaddr->ds_num; i++) {
                ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
        u8 max_stripe_index;
        struct nfs4_file_layout_dsaddr *dsaddr = NULL;
        struct xdr_stream stream;
-       struct xdr_buf buf = {
-               .pages = pdev->pages,
-               .page_len = pdev->pglen,
-               .buflen = pdev->pglen,
-               .len = pdev->pglen,
-       };
+       struct xdr_buf buf;
        struct page *scratch;
 
        /* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
        if (!scratch)
                goto out_err;
 
-       xdr_init_decode(&stream, &buf, NULL);
+       xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
        /* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
        dsaddr->stripe_indices = stripe_indices;
        stripe_indices = NULL;
        dsaddr->ds_num = num;
-
-       memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
+       nfs4_init_deviceid_node(&dsaddr->id_node,
+                               NFS_SERVER(ino)->pnfs_curr_ld,
+                               NFS_SERVER(ino)->nfs_client,
+                               &pdev->dev_id);
 
        for (i = 0; i < dsaddr->ds_num; i++) {
                int j;
@@ -505,8 +457,8 @@ out_err:
 static struct nfs4_file_layout_dsaddr *
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
 {
-       struct nfs4_file_layout_dsaddr *d, *new;
-       long hash;
+       struct nfs4_deviceid_node *d;
+       struct nfs4_file_layout_dsaddr *n, *new;
 
        new = decode_device(inode, dev, gfp_flags);
        if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
                return NULL;
        }
 
-       spin_lock(&filelayout_deviceid_lock);
-       d = nfs4_fl_find_get_deviceid(&new->deviceid);
-       if (d) {
-               spin_unlock(&filelayout_deviceid_lock);
+       d = nfs4_insert_deviceid_node(&new->id_node);
+       n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+       if (n != new) {
                nfs4_fl_free_deviceid(new);
-               return d;
+               return n;
        }
 
-       INIT_HLIST_NODE(&new->node);
-       atomic_set(&new->ref, 1);
-       hash = nfs4_fl_deviceid_hash(&new->deviceid);
-       hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
-       spin_unlock(&filelayout_deviceid_lock);
-
        return new;
 }
 
@@ -600,35 +545,7 @@ out_free:
 void
 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
-       if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
-               hlist_del_rcu(&dsaddr->node);
-               spin_unlock(&filelayout_deviceid_lock);
-
-               synchronize_rcu();
-               nfs4_fl_free_deviceid(dsaddr);
-       }
-}
-
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
-{
-       struct nfs4_file_layout_dsaddr *d;
-       struct hlist_node *n;
-       long hash = nfs4_fl_deviceid_hash(id);
-
-
-       rcu_read_lock();
-       hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
-               if (!memcmp(&d->deviceid, id, sizeof(*id))) {
-                       if (!atomic_inc_not_zero(&d->ref))
-                               goto fail;
-                       rcu_read_unlock();
-                       return d;
-               }
-       }
-fail:
-       rcu_read_unlock();
-       return NULL;
+       nfs4_put_deviceid_node(&dsaddr->id_node);
 }
 
 /*
@@ -676,15 +593,15 @@ static void
 filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
                               int err, u32 ds_addr)
 {
-       u32 *p = (u32 *)&dsaddr->deviceid;
+       u32 *p = (u32 *)&dsaddr->id_node.deviceid;
 
        printk(KERN_ERR "NFS: data server %x connection error %d."
                " Deviceid [%x%x%x%x] marked out of use.\n",
                ds_addr, err, p[0], p[1], p[2], p[3]);
 
-       spin_lock(&filelayout_deviceid_lock);
+       spin_lock(&nfs4_ds_cache_lock);
        dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
-       spin_unlock(&filelayout_deviceid_lock);
+       spin_unlock(&nfs4_ds_cache_lock);
 }
 
 struct nfs4_pnfs_ds *
index d0e15db..d2c4b59 100644 (file)
@@ -2363,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
        struct nfs4_state *state = NULL;
        int status;
 
+       if (pnfs_ld_layoutret_on_setattr(inode))
+               pnfs_return_layout(inode);
+
        nfs_fattr_init(fattr);
        
        /* Search for an existing open(O_WRITE) file */
@@ -3177,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return err;
 }
 
+void __nfs4_read_done_cb(struct nfs_read_data *data)
+{
+       nfs_invalidate_atime(data->inode); /* shared by nfs4_read_done_cb() */
+}
+
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
        struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3186,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
                return -EAGAIN;
        }
 
-       nfs_invalidate_atime(data->inode);
+       __nfs4_read_done_cb(data);
        if (task->tk_status > 0)
                renew_lease(server, data->timestamp);
        return 0;
@@ -3200,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
        if (!nfs4_sequence_done(task, &data->res.seq_res))
                return -EAGAIN;
 
-       return data->read_done_cb(task, data);
+       return data->read_done_cb ? data->read_done_cb(task, data) :
+                                   nfs4_read_done_cb(task, data);
 }
 
 static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3245,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        if (!nfs4_sequence_done(task, &data->res.seq_res))
                return -EAGAIN;
-       return data->write_done_cb(task, data);
+       return data->write_done_cb ? data->write_done_cb(task, data) :
+               nfs4_write_done_cb(task, data);
 }
 
 /* Reset the the nfs_write_data to send the write to the MDS. */
@@ -5671,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
        return status;
 }
 
+static void     /* .rpc_call_prepare hook of nfs4_layoutreturn_call_ops */
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_layoutreturn *lrp = calldata;
+
+       dprintk("--> %s\n", __func__);
+       if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+                               &lrp->res.seq_res, 0, task))
+               return;         /* non-zero from sequence setup: do not start */
+       rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_layoutreturn *lrp = calldata;
+       struct nfs_server *server;
+
+       dprintk("--> %s\n", __func__);
+       /* .rpc_call_done hook: stop here while nfs4_sequence_done() returns 0 */
+       if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+               return;
+       /* -EAGAIN from the async error handler: restart the RPC and return */
+       server = NFS_SERVER(lrp->args.inode);
+       if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+               nfs_restart_rpc(task, lrp->clp);
+               return;
+       }
+       if (task->tk_status == 0) {
+               struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+               if (lrp->res.lrs_present) {     /* store res.stateid */
+                       spin_lock(&lo->plh_inode->i_lock);
+                       pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+                       spin_unlock(&lo->plh_inode->i_lock);
+               } else                          /* else no segment may remain */
+                       BUG_ON(!list_empty(&lo->plh_segs));
+       }
+       dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+       struct nfs4_layoutreturn *lrp = calldata;
+
+       dprintk("--> %s\n", __func__);
+       put_layout_hdr(NFS_I(lrp->args.inode)->layout); /* drop a hdr reference */
+       kfree(calldata);        /* lrp aliases calldata and is gone after this */
+       dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+       .rpc_call_prepare = nfs4_layoutreturn_prepare,
+       .rpc_call_done = nfs4_layoutreturn_done,
+       .rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+{       /* issue LAYOUTRETURN via rpc_run_task(); returns task->tk_status */
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+               .rpc_argp = &lrp->args,
+               .rpc_resp = &lrp->res,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = lrp->clp->cl_rpcclient,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_layoutreturn_call_ops,
+               .callback_data = lrp,   /* kfree()d by the .rpc_release hook */
+       };
+       int status;
+
+       dprintk("--> %s\n", __func__);
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task))
+               return PTR_ERR(task);   /* rpc_run_task() failed: its errno */
+       status = task->tk_status;       /* read before the task ref is dropped */
+       dprintk("<-- %s status=%d\n", __func__, status);
+       rpc_put_task(task);
+       return status;
+}
+
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 {
index c3ccd2c..d869a5e 100644 (file)
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
                                1 /* layoutupdate4 layout type */ + \
                                1 /* NULL filelayout layoutupdate4 payload */)
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
-
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+                               encode_stateid_maxsz + \
+                               1 /* FIXME: opaque lrf_body always empty at the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+                               1 + decode_stateid_maxsz)
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz  0
 #define decode_sequence_maxsz  0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
                                decode_putfh_maxsz + \
                                decode_layoutcommit_maxsz + \
                                decode_getattr_maxsz)
-
+/* Whole-compound sizes for LAYOUTRETURN: header, SEQUENCE, PUTFH, LAYOUTRETURN. */
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+                               encode_sequence_maxsz + \
+                               encode_putfh_maxsz + \
+                               encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+                               decode_sequence_maxsz + \
+                               decode_putfh_maxsz + \
+                               decode_layoutreturn_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                      compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
 
 static int
 encode_layoutcommit(struct xdr_stream *xdr,
+                   struct inode *inode,
                    const struct nfs4_layoutcommit_args *args,
                    struct compound_hdr *hdr)
 {
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
        dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
                NFS_SERVER(args->inode)->pnfs_curr_ld->id);
 
-       p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+       p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
        *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
        /* Only whole file layouts */
        p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
        p = xdr_encode_hyper(p, args->lastbytewritten);
        *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
        *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
-       *p++ = cpu_to_be32(0); /* no file layout payload */
+
+       if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+               NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+                       NFS_I(inode)->layout, xdr, args);
+       else {
+               p = reserve_space(xdr, 4);
+               *p = cpu_to_be32(0); /* no layout-type payload */
+       }
 
        hdr->nops++;
        hdr->replen += decode_layoutcommit_maxsz;
        return 0;
 }
+
+/*
+ * Append a LAYOUTRETURN operation to @xdr.
+ *
+ * The fixed part is: opcode, reclaim (0), layout type, IOMODE_ANY,
+ * RETURN_FILE, offset 0, length NFS4_MAX_UINT64 and the stateid, which is
+ * copied under the inode's i_lock.  The layout-type specific body comes from
+ * the layout driver's encode_layoutreturn hook when it has one, otherwise a
+ * single zero word is written.  @hdr's op count and reply length are updated.
+ */
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+                   const struct nfs4_layoutreturn_args *args,
+                   struct compound_hdr *hdr)
+{
+       struct inode *inode = args->inode;
+       __be32 *p;
+
+       p = reserve_space(xdr, 20);
+       p[0] = cpu_to_be32(OP_LAYOUTRETURN);
+       p[1] = cpu_to_be32(0);          /* reclaim. always 0 for now */
+       p[2] = cpu_to_be32(args->layout_type);
+       p[3] = cpu_to_be32(IOMODE_ANY);
+       p[4] = cpu_to_be32(RETURN_FILE);
+
+       p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+       p = xdr_encode_hyper(p, 0);
+       p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+       spin_lock(&inode->i_lock);
+       xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
+       spin_unlock(&inode->i_lock);
+
+       if (!NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutreturn) {
+               /* no driver hook: empty body */
+               p = reserve_space(xdr, 4);
+               *p = cpu_to_be32(0);
+       } else {
+               NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutreturn(
+                       NFS_I(inode)->layout, xdr, args);
+       }
+
+       hdr->nops++;
+       hdr->replen += decode_layoutreturn_maxsz;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
 /*
  *  Encode LAYOUTCOMMIT request
  */
-static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
-                                    struct xdr_stream *xdr,
-                                    struct nfs4_layoutcommit_args *args)
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_layoutcommit_args *args)
 {
+       struct nfs4_layoutcommit_data *data =
+               container_of(args, struct nfs4_layoutcommit_data, args);
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
        encode_compound_hdr(xdr, req, &hdr);
        encode_sequence(xdr, &args->seq_args, &hdr);
        encode_putfh(xdr, NFS_FH(args->inode), &hdr);
-       encode_layoutcommit(xdr, args, &hdr);
+       encode_layoutcommit(xdr, data->args.inode, args, &hdr);
        encode_getfattr(xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-       return 0;
+}
+
+/*
+ * Encode a LAYOUTRETURN compound: SEQUENCE, PUTFH of args->inode's
+ * filehandle, then LAYOUTRETURN.
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_layoutreturn_args *args)
+{
+       struct compound_hdr compound = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &compound);
+       encode_sequence(xdr, &args->seq_args, &compound);
+       encode_putfh(xdr, NFS_FH(args->inode), &compound);
+       encode_layoutreturn(xdr, args, &compound);
+       encode_nops(&compound);
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -5203,6 +5271,27 @@ out_overflow:
        return -EIO;
 }
 
+/*
+ * Decode the LAYOUTRETURN result: the lrs_present flag and, when it is set,
+ * the stateid that follows.  Returns 0, the status from decode_op_hdr() or
+ * decode_stateid(), or -EIO when the flag word cannot be read.
+ */
+static int decode_layoutreturn(struct xdr_stream *xdr,
+                              struct nfs4_layoutreturn_res *res)
+{
+       __be32 *word;
+       int err;
+
+       err = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+       if (err)
+               return err;
+
+       word = xdr_inline_decode(xdr, 4);
+       if (unlikely(!word)) {
+               print_overflow_msg(__func__, xdr);
+               return -EIO;
+       }
+
+       res->lrs_present = be32_to_cpup(word);
+       if (!res->lrs_present)
+               return 0;
+
+       return decode_stateid(xdr, &res->stateid);
+}
+
 static int decode_layoutcommit(struct xdr_stream *xdr,
                               struct rpc_rqst *req,
                               struct nfs4_layoutcommit_res *res)
@@ -6319,6 +6408,30 @@ out:
        return status;
 }
 
+/*
+ * Decode a LAYOUTRETURN compound reply: header, SEQUENCE, PUTFH and
+ * LAYOUTRETURN, in that order.  Decoding stops at the first step that
+ * reports a non-zero status, and that status is returned.
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+                                    struct xdr_stream *xdr,
+                                    struct nfs4_layoutreturn_res *res)
+{
+       struct compound_hdr hdr;
+       int err;
+
+       err = decode_compound_hdr(xdr, &hdr);
+       if (!err)
+               err = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (!err)
+               err = decode_putfh(xdr);
+       if (!err)
+               err = decode_layoutreturn(xdr, res);
+
+       return err;
+}
+
 /*
  * Decode LAYOUTCOMMIT response
  */
@@ -6547,6 +6660,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(GETDEVICEINFO,     enc_getdeviceinfo,      dec_getdeviceinfo),
        PROC(LAYOUTGET,         enc_layoutget,          dec_layoutget),
        PROC(LAYOUTCOMMIT,      enc_layoutcommit,       dec_layoutcommit),
+       PROC(LAYOUTRETURN,      enc_layoutreturn,       dec_layoutreturn),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644 (file)
index 0000000..ed30ea0
--- /dev/null
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644 (file)
index 0000000..9cf208d
--- /dev/null
@@ -0,0 +1,1057 @@
+/*
+ *  pNFS Objects layout implementation over open-osd initiator library
+ *
+ *  Copyright (C) 2009 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <scsi/osd_initiator.h>
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/* Cast for printing with %llx/%llu; argument parenthesized so the cast covers the whole expression. */
+#define _LLU(x) ((unsigned long long)(x))
+
+/* Number of struct bio_vec entries that fit in one page next to a struct bio. */
+enum { BIO_MAX_PAGES_KMALLOC =
+               (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
+/* A device-id cache node paired with the osd_dev handle it stands for. */
+struct objio_dev_ent {
+       struct nfs4_deviceid_node id_node; /* embedded; owner recovered with container_of() */
+       struct osd_dev *od; /* put with osduld_put_device() when the entry is freed */
+};
+
+/*
+ * Free the objio_dev_ent that embeds @d: put its osd device, then release
+ * the entry's memory.
+ */
+static void
+objio_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+       struct objio_dev_ent *ent;
+
+       ent = container_of(d, struct objio_dev_ent, id_node);
+       dprintk("%s: free od=%p\n", __func__, ent->od);
+
+       osduld_put_device(ent->od);
+       kfree(ent);
+}
+
+/*
+ * Look @d_id up through nfs4_find_get_deviceid() for this server's layout
+ * driver and client.  Returns the enclosing objio_dev_ent, or NULL when the
+ * lookup finds nothing.
+ */
+static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
+       const struct nfs4_deviceid *d_id)
+{
+       struct nfs4_deviceid_node *node =
+               nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client,
+                                      d_id);
+
+       return node ? container_of(node, struct objio_dev_ent, id_node) : NULL;
+}
+
+/*
+ * Wrap @od in a new objio_dev_ent keyed by @d_id and insert it with
+ * nfs4_insert_deviceid_node().
+ *
+ * If the insert hands back a different node (someone else added the same
+ * id first), the new entry is freed again - which also puts @od - and the
+ * existing entry is used.  The returned entry has had its node ref
+ * incremented.  Returns NULL, leaving @od untouched, when the entry cannot
+ * be allocated.
+ */
+static struct objio_dev_ent *
+_dev_list_add(const struct nfs_server *nfss,
+       const struct nfs4_deviceid *d_id, struct osd_dev *od,
+       gfp_t gfp_flags)
+{
+       struct objio_dev_ent *ent, *inserted;
+
+       ent = kzalloc(sizeof(*ent), gfp_flags);
+       if (!ent) {
+               dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+               return NULL;
+       }
+
+       dprintk("%s: Adding od=%p\n", __func__, od);
+       nfs4_init_deviceid_node(&ent->id_node,
+                               nfss->pnfs_curr_ld,
+                               nfss->nfs_client,
+                               d_id);
+       ent->od = od;
+
+       inserted = container_of(nfs4_insert_deviceid_node(&ent->id_node),
+                               struct objio_dev_ent, id_node);
+       if (inserted != ent) {
+               dprintk("%s: Race with other n->od=%p\n", __func__,
+                       inserted->od);
+               objio_free_deviceid_node(&ent->id_node);
+               ent = inserted;
+       }
+
+       atomic_inc(&ent->id_node.ref);
+       return ent;
+}
+
+/* Per-component backing store that copy_single_comp() fills and points the credential at. */
+struct caps_buffers {
+       u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; /* copy of oc_cap_key.cred */
+       u8 creds[OSD_CAP_LEN]; /* copy of oc_cap.cred */
+};
+
+/*
+ * Driver-private layout segment.  objio_alloc_lseg() allocates it in one
+ * chunk together with the ods[] array, the comps array and the per-component
+ * caps_buffers, in that order.
+ */
+struct objio_segment {
+       struct pnfs_layout_segment lseg; /* generic part; see OBJIO_LSEG() */
+
+       struct pnfs_osd_object_cred *comps; /* points just past ods[] in the same allocation */
+
+       unsigned mirrors_p1; /* odm_mirror_cnt + 1 */
+       unsigned stripe_unit;
+       unsigned group_width;   /* Data stripe_units without integrity comps */
+       u64 group_depth; /* set to -1 when the layout has no odm_group_width */
+       unsigned group_count;
+
+       unsigned max_io_size; /* precomputed once in objio_alloc_lseg() */
+
+       unsigned comps_index; /* olo_comps_index; _io_od() subtracts it to index ods[] */
+       unsigned num_comps; /* number of entries in comps and ods[] */
+       /* variable length */
+       struct objio_dev_ent *ods[];
+};
+
+/* Map a generic layout segment back to the objio_segment that embeds it. */
+static inline struct objio_segment *
+OBJIO_LSEG(struct pnfs_layout_segment *lseg)
+{
+       struct objio_segment *seg;
+
+       seg = container_of(lseg, struct objio_segment, lseg);
+       return seg;
+}
+
+struct objio_state;
+/* Completion hook; called from _last_io() when the last kref on the state is put. */
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+
+/*
+ * State of one read/write.  objio_alloc_io_state() allocates it with room
+ * for num_comps per_dev[] slots, followed by the ol_state.ioerrs array.
+ */
+struct objio_state {
+       /* Generic layer */
+       struct objlayout_io_state ol_state;
+
+       struct objio_segment *layout; /* segment this IO runs against */
+
+       struct kref kref; /* one reference per submitted osd request; see _io_exec() */
+       objio_done_fn done;
+       void *private; /* argument for @done; a completion in the sync case */
+
+       unsigned long length; /* bytes queued so far across all devices */
+       unsigned numdevs; /* Actually used devs in this IO */
+       /* A per-device variable array of size numdevs */
+       struct _objio_per_comp {
+               struct bio *bio;
+               struct osd_request *or;
+               unsigned long length; /* bytes queued for this device */
+               u64 offset; /* starting offset inside the object */
+               unsigned dev; /* device number as passed to _io_od() */
+       } per_dev[];
+};
+
+/*
+ * Resolve component @comp of @objio_seg to a cached device entry.
+ *
+ * The device-id cache is consulted first.  On a miss the device address is
+ * fetched with objlayout_get_deviceinfo(), the osd device is looked up via
+ * the osd_initiator library and the result is added to the cache.
+ *
+ * Returns the entry, or an ERR_PTR(): -EINVAL for an over-long system id,
+ * -ENODEV when neither an osd name nor a system id is supplied, -ENOMEM
+ * when the cache entry cannot be allocated, or whatever error the helpers
+ * reported.  NULL is never returned.
+ */
+static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+                               struct objio_segment *objio_seg, unsigned comp,
+                               gfp_t gfp_flags)
+{
+       struct pnfs_osd_deviceaddr *deviceaddr;
+       struct nfs4_deviceid *d_id;
+       struct objio_dev_ent *ode;
+       struct osd_dev *od;
+       struct osd_dev_info odi;
+       int err;
+
+       d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
+
+       ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
+       if (ode)
+               return ode;
+
+       err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
+       if (unlikely(err)) {
+               dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
+                       __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
+               return ERR_PTR(err);
+       }
+
+       odi.systemid_len = deviceaddr->oda_systemid.len;
+       if (odi.systemid_len > sizeof(odi.systemid)) {
+               err = -EINVAL;
+               goto out;
+       } else if (odi.systemid_len)
+               memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+                      odi.systemid_len);
+       odi.osdname_len  = deviceaddr->oda_osdname.len;
+       odi.osdname      = (u8 *)deviceaddr->oda_osdname.data;
+
+       if (!odi.osdname_len && !odi.systemid_len) {
+               dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+                       __func__);
+               err = -ENODEV;
+               goto out;
+       }
+
+       od = osduld_info_lookup(&odi);
+       if (unlikely(IS_ERR(od))) {
+               err = PTR_ERR(od);
+               dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+               goto out;
+       }
+
+       ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
+                           gfp_flags);
+       if (unlikely(!ode)) {
+               /*
+                * _dev_list_add() only returns NULL when it could not
+                * allocate the entry, in which case it never took over od.
+                * Drop od here and report the failure instead of handing a
+                * NULL device to the caller.
+                */
+               osduld_put_device(od);
+               err = -ENOMEM;
+       }
+
+out:
+       dprintk("%s: return=%d\n", __func__, err);
+       objlayout_put_deviceinfo(deviceaddr);
+       return err ? ERR_PTR(err) : ode;
+}
+
+/*
+ * Fill objio_seg->ods[] by looking up the device of every component.
+ * Stops at the first lookup that yields an ERR_PTR and returns its error;
+ * entries stored before that point are left in place.  Returns 0 when every
+ * component was looked up.
+ */
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+       struct objio_segment *objio_seg,
+       gfp_t gfp_flags)
+{
+       int err = 0;
+       unsigned comp;
+
+       for (comp = 0; comp < objio_seg->num_comps; comp++) {
+               struct objio_dev_ent *ode =
+                       _device_lookup(pnfslay, objio_seg, comp, gfp_flags);
+
+               if (unlikely(IS_ERR(ode))) {
+                       err = PTR_ERR(ode);
+                       break;
+               }
+               objio_seg->ods[comp] = ode;
+       }
+
+       dprintk("%s: return=%d\n", __func__, err);
+       return err;
+}
+
+/*
+ * Sanity-check the data map of a decoded layout before it is used for
+ * striping arithmetic.
+ *
+ * Returns 0 when the map is usable, -ENOTSUPP for layouts this driver does
+ * not handle (non RAID_0, stripe length of 32 bits or more, stripe unit not
+ * a multiple of PAGE_SIZE) and -EINVAL for maps that are inconsistent or
+ * would make the stripe calculations divide by zero.
+ */
+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+       struct pnfs_osd_data_map *data_map = &layout->olo_map;
+       u64 stripe_length;
+       u32 mirrors_p1;
+       u32 group_width;
+
+/* FIXME: Only raid0 for now. if not go through MDS */
+       if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+               printk(KERN_ERR "Only RAID_0 for now\n");
+               return -ENOTSUPP;
+       }
+
+       /* mirror_cnt + 1 is used as a divisor below; reject a wrap to 0 */
+       mirrors_p1 = data_map->odm_mirror_cnt + 1;
+       if (!mirrors_p1) {
+               printk(KERN_ERR "Data Map wrong, mirrors=%u\n",
+                         data_map->odm_mirror_cnt);
+               return -EINVAL;
+       }
+       if (0 != (data_map->odm_num_comps % mirrors_p1)) {
+               printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+                         data_map->odm_num_comps, data_map->odm_mirror_cnt);
+               return -EINVAL;
+       }
+
+       if (data_map->odm_group_width) {
+               group_width = data_map->odm_group_width;
+
+               /* a zero depth makes the group length 0 (later divisor) */
+               if (!data_map->odm_group_depth) {
+                       printk(KERN_ERR "Data Map wrong, group_depth == 0 &&"
+                                 " group_width=%u\n", group_width);
+                       return -EINVAL;
+               }
+               /* otherwise group_count would come out as 0 */
+               if (data_map->odm_num_comps / mirrors_p1 < group_width) {
+                       printk(KERN_ERR "Data Map wrong, num_comps=%u"
+                                 " mirrors=%u group_width=%u\n",
+                                 data_map->odm_num_comps,
+                                 data_map->odm_mirror_cnt, group_width);
+                       return -EINVAL;
+               }
+       } else
+               group_width = data_map->odm_num_comps / mirrors_p1;
+
+       if (!group_width) {
+               printk(KERN_ERR "Data Map wrong, num_comps=%u\n",
+                         data_map->odm_num_comps);
+               return -EINVAL;
+       }
+
+       if (!data_map->odm_stripe_unit) {
+               printk(KERN_ERR "Data Map wrong, stripe_unit == 0\n");
+               return -EINVAL;
+       }
+
+       stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+       if (stripe_length >= (1ULL << 32)) {
+               printk(KERN_ERR "Total Stripe length(0x%llx)"
+                         " >= 32bit is not supported\n", _LLU(stripe_length));
+               return -ENOTSUPP;
+       }
+
+       if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+               printk(KERN_ERR "Stripe Unit(0x%llx)"
+                         " must be Multples of PAGE_SIZE(0x%lx)\n",
+                         _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+               return -ENOTSUPP;
+       }
+
+       return 0;
+}
+
+/*
+ * Copy one decoded component credential into the segment's own storage.
+ *
+ * @cur_comp receives a copy of @src_comp whose capability-key and
+ * capability pointers are redirected to the buffers in @caps_p, so the copy
+ * does not point into the source any more.  At most cred_len bytes are read
+ * from each source credential; the rest of each destination buffer is
+ * zeroed.  A credential longer than its buffer triggers a warning and is
+ * truncated to the buffer size.
+ */
+static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
+                            struct pnfs_osd_object_cred *src_comp,
+                            struct caps_buffers *caps_p)
+{
+       size_t key_len = min_t(size_t, src_comp->oc_cap_key.cred_len,
+                              sizeof(caps_p->caps_key));
+       size_t cap_len = min_t(size_t, src_comp->oc_cap.cred_len,
+                              sizeof(caps_p->creds));
+
+       WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
+       WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
+
+       *cur_comp = *src_comp;
+
+       /* never read past the bytes the source credential really has */
+       memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, key_len);
+       memset(caps_p->caps_key + key_len, 0,
+              sizeof(caps_p->caps_key) - key_len);
+       cur_comp->oc_cap_key.cred = caps_p->caps_key;
+
+       memcpy(caps_p->creds, src_comp->oc_cap.cred, cap_len);
+       memset(caps_p->creds + cap_len, 0, sizeof(caps_p->creds) - cap_len);
+       cur_comp->oc_cap.cred = caps_p->creds;
+}
+
+/*
+ * Decode an objects layout from @xdr and build the driver's layout segment.
+ *
+ * The segment, its ods[] array, the component array and the per-component
+ * credential buffers are carved out of a single zeroed allocation.  After
+ * the components are decoded every device is looked up and the striping
+ * parameters are derived from the data map.
+ *
+ * On success *@outp points at the embedded generic segment and 0 is
+ * returned.  If decoding or verifying the map fails the error is returned
+ * with *@outp untouched; later failures release everything acquired so far,
+ * set *@outp to NULL and return the error.
+ */
+int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+       struct pnfs_layout_hdr *pnfslay,
+       struct pnfs_layout_range *range,
+       struct xdr_stream *xdr,
+       gfp_t gfp_flags)
+{
+       struct objio_segment *objio_seg;
+       struct pnfs_osd_xdr_decode_layout_iter iter;
+       struct pnfs_osd_layout layout;
+       struct pnfs_osd_object_cred *cur_comp, src_comp;
+       struct caps_buffers *caps_p;
+       unsigned i;
+       int err;
+
+       err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+       if (unlikely(err))
+               return err;
+
+       err = _verify_data_map(&layout);
+       if (unlikely(err))
+               return err;
+
+       objio_seg = kzalloc(sizeof(*objio_seg) +
+                           sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
+                           sizeof(*objio_seg->comps) * layout.olo_num_comps +
+                           sizeof(struct caps_buffers) * layout.olo_num_comps,
+                           gfp_flags);
+       if (!objio_seg)
+               return -ENOMEM;
+
+       /* layout of the allocation: segment, ods[], comps[], caps_buffers[] */
+       objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
+       cur_comp = objio_seg->comps;
+       caps_p = (void *)(cur_comp + layout.olo_num_comps);
+       while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
+               copy_single_comp(cur_comp++, &src_comp, caps_p++);
+       if (unlikely(err))
+               goto err;
+
+       objio_seg->num_comps = layout.olo_num_comps;
+       objio_seg->comps_index = layout.olo_comps_index;
+       err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
+       if (err)
+               goto err;
+
+       objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+       objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
+       if (layout.olo_map.odm_group_width) {
+               objio_seg->group_width = layout.olo_map.odm_group_width;
+               objio_seg->group_depth = layout.olo_map.odm_group_depth;
+               objio_seg->group_count = layout.olo_map.odm_num_comps /
+                                               objio_seg->mirrors_p1 /
+                                               objio_seg->group_width;
+       } else {
+               objio_seg->group_width = layout.olo_map.odm_num_comps /
+                                               objio_seg->mirrors_p1;
+               objio_seg->group_depth = -1;
+               objio_seg->group_count = 1;
+       }
+
+       /* Cache this calculation it will hit for every page */
+       objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
+                                 objio_seg->stripe_unit) *
+                                objio_seg->group_width;
+
+       *outp = &objio_seg->lseg;
+       return 0;
+
+err:
+       /*
+        * objio_devices_lookup() may have stored device entries before it
+        * failed; put them so they are not lost together with the segment.
+        * num_comps is still 0 when component decoding failed, so the loop
+        * does nothing in that case.
+        */
+       for (i = 0; i < objio_seg->num_comps; i++) {
+               if (objio_seg->ods[i])
+                       nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+       }
+       kfree(objio_seg);
+       dprintk("%s: Error: return %d\n", __func__, err);
+       *outp = NULL;
+       return err;
+}
+
+/*
+ * Release a segment built by objio_alloc_lseg(): put the device entries in
+ * ods[] up to the first empty slot, then free the segment itself.
+ */
+void objio_free_lseg(struct pnfs_layout_segment *lseg)
+{
+       struct objio_segment *seg = OBJIO_LSEG(lseg);
+       unsigned idx;
+
+       for (idx = 0; idx < seg->num_comps && seg->ods[idx]; idx++)
+               nfs4_put_deviceid_node(&seg->ods[idx]->id_node);
+
+       kfree(seg);
+}
+
+/*
+ * Allocate a zeroed objio_state for an IO on @lseg.
+ *
+ * One allocation holds the state with a per_dev[] slot for every component,
+ * followed by the ioerrs array of the generic state.  On success *@outp is
+ * set to the embedded generic state and 0 is returned; -ENOMEM otherwise.
+ */
+int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
+                        struct objlayout_io_state **outp,
+                        gfp_t gfp_flags)
+{
+       struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+       const unsigned comps = objio_seg->num_comps;
+       struct objio_state *ios;
+       const unsigned state_size = sizeof(*ios) +
+                               comps * sizeof(ios->per_dev[0]);
+       const unsigned ioerrs_size = comps * sizeof(ios->ol_state.ioerrs[0]);
+
+       ios = kzalloc(state_size + ioerrs_size, gfp_flags);
+       if (unlikely(!ios))
+               return -ENOMEM;
+
+       ios->layout = objio_seg;
+       ios->ol_state.num_comps = comps;
+       /* the error array lives right behind the per_dev[] slots */
+       ios->ol_state.ioerrs = ((void *)ios) + state_size;
+
+       *outp = &ios->ol_state;
+       return 0;
+}
+
+/* Free the objio_state that embeds @ol_state. */
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+       kfree(container_of(ol_state, struct objio_state, ol_state));
+}
+
+/*
+ * Translate an osd error priority into the pNFS-objects error code.
+ *
+ * NO_ERROR maps to 0.  CLEAR_PAGES is not expected here and hits BUG_ON().
+ * Priorities without a case of their own warn and are reported as
+ * PNFS_OSD_ERR_EIO.
+ */
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+       switch (oep) {
+       case OSD_ERR_PRI_NO_ERROR:
+               break;
+       case OSD_ERR_PRI_CLEAR_PAGES:
+               BUG_ON(1);
+               break;
+
+       case OSD_ERR_PRI_RESOURCE:
+               return PNFS_OSD_ERR_RESOURCE;
+       case OSD_ERR_PRI_BAD_CRED:
+               return PNFS_OSD_ERR_BAD_CRED;
+       case OSD_ERR_PRI_NO_ACCESS:
+               return PNFS_OSD_ERR_NO_ACCESS;
+       case OSD_ERR_PRI_UNREACHABLE:
+               return PNFS_OSD_ERR_UNREACHABLE;
+       case OSD_ERR_PRI_NOT_FOUND:
+               return PNFS_OSD_ERR_NOT_FOUND;
+       case OSD_ERR_PRI_NO_SPACE:
+               return PNFS_OSD_ERR_NO_SPACE;
+       case OSD_ERR_PRI_EIO:
+               return PNFS_OSD_ERR_EIO;
+
+       default:
+               WARN_ON(1);
+               return PNFS_OSD_ERR_EIO;
+       }
+
+       return (enum pnfs_osd_errno)0;
+}
+
+/*
+ * Zero the data area of every segment of @bio: whole pages with
+ * clear_highpage(), partial ones with zero_user().
+ */
+static void _clear_bio(struct bio *bio)
+{
+       struct bio_vec *bv;
+       unsigned i;
+
+       __bio_for_each_segment(bv, bio, i, 0) {
+               if (unlikely(bv->bv_len != PAGE_SIZE)) {
+                       zero_user(bv->bv_page, bv->bv_offset, bv->bv_len);
+                       continue;
+               }
+               clear_highpage(bv->bv_page);
+       }
+}
+
+/*
+ * Inspect the sense data of every request issued for @ios.
+ *
+ * A CLEAR_PAGES result (only legal for reads) is recovered from by zeroing
+ * the pages of that device's bio.  Every other failure is recorded with
+ * objlayout_io_set_result().  Returns the decode status of the failure with
+ * the highest error priority (the later one wins on a tie), or 0 when
+ * nothing failed.
+ */
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+       enum osd_err_priority worst_pri = OSD_ERR_PRI_NO_ERROR;
+       int worst_ret = 0;
+       int i;
+
+       for (i = 0; i <  ios->numdevs; i++) {
+               struct _objio_per_comp *per_dev = &ios->per_dev[i];
+               struct osd_sense_info osi;
+               unsigned dev;
+               int ret;
+
+               if (!per_dev->or)
+                       continue;
+
+               ret = osd_req_decode_sense(per_dev->or, &osi);
+               if (likely(!ret))
+                       continue;
+
+               if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+                       /* the read started past the end of the object */
+                       BUG_ON(is_write);
+                       _clear_bio(per_dev->bio);
+                       dprintk("%s: start read offset passed end of file "
+                               "offset=0x%llx, length=0x%lx\n", __func__,
+                               _LLU(per_dev->offset),
+                               per_dev->length);
+
+                       continue; /* we recovered */
+               }
+
+               dev = per_dev->dev;
+               objlayout_io_set_result(&ios->ol_state, dev,
+                                       &ios->layout->comps[dev].oc_object_id,
+                                       osd_pri_2_pnfs_err(osi.osd_err_pri),
+                                       per_dev->offset,
+                                       per_dev->length,
+                                       is_write);
+
+               if (osi.osd_err_pri >= worst_pri) {
+                       worst_pri = osi.osd_err_pri;
+                       worst_ret = ret;
+               }
+       }
+
+       return worst_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+/*
+ * End the osd request and drop the bio of every used per-device slot,
+ * clearing the pointers so the slots can be seen as empty afterwards.
+ */
+static void _io_free(struct objio_state *ios)
+{
+       struct _objio_per_comp *slot;
+       unsigned i;
+
+       for (i = 0; i < ios->numdevs; i++) {
+               slot = &ios->per_dev[i];
+
+               if (slot->or) {
+                       osd_end_request(slot->or);
+                       slot->or = NULL;
+               }
+
+               if (slot->bio) {
+                       bio_put(slot->bio);
+                       slot->bio = NULL;
+               }
+       }
+}
+
+/*
+ * Return the osd device for device number @dev.  @dev must lie inside
+ * [comps_index, comps_index + num_comps) of the IO's layout segment;
+ * anything else is a BUG.
+ */
+struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
+{
+       struct objio_segment *seg = ios->layout;
+       unsigned first = seg->comps_index;
+       unsigned end = first + seg->num_comps;
+
+       BUG_ON(!(first <= dev && dev < end));
+       return seg->ods[dev - first]->od;
+}
+
+/* Result of _calc_stripe_info() for one file offset. */
+struct _striping_info {
+       u64 obj_offset; /* offset inside the object */
+       u64 group_length; /* bytes from the file offset to the end of its group (T - H) */
+       unsigned dev; /* device number, already multiplied by mirrors_p1 */
+       unsigned unit_off; /* file offset modulo stripe_unit */
+};
+
+/*
+ * Map @file_offset onto the striped layout of @ios and store the result in
+ * @si: the device, the offset inside the object, the offset inside the
+ * stripe unit and how many bytes remain in the current group.
+ *
+ * NOTE(review): every divisor below (S, T, U, stripe_unit) comes from the
+ * layout; this function assumes they are non-zero.
+ */
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+                             struct _striping_info *si)
+{
+       u32     stripe_unit = ios->layout->stripe_unit;
+       u32     group_width = ios->layout->group_width;
+       u64     group_depth = ios->layout->group_depth;
+       u32     U = stripe_unit * group_width; /* bytes in one full stripe */
+
+       u64     T = U * group_depth; /* bytes in one group */
+       u64     S = T * ios->layout->group_count; /* bytes in one pass over all groups */
+       u64     M = div64_u64(file_offset, S); /* number of whole S-sized passes before the offset */
+
+       /*
+       With L = file_offset:
+       G = (L - (M * S)) / T
+       H = (L - (M * S)) % T
+       */
+       u64     LmodU = file_offset - M * S; /* despite the name: file_offset modulo S */
+       u32     G = div64_u64(LmodU, T); /* group index inside the pass */
+       u64     H = LmodU - G * T; /* byte offset inside that group */
+
+       u32     N = div_u64(H, U); /* stripe index inside the group */
+
+       div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+       si->obj_offset = si->unit_off + (N * stripe_unit) +
+                                 (M * group_depth * stripe_unit);
+
+       /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+       si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+       si->dev *= ios->layout->mirrors_p1; /* each stripe position occupies mirrors_p1 devices */
+
+       si->group_length = T - H;
+}
+
+/*
+ * Queue @cur_len bytes of the IO's page array onto @per_dev's bio.
+ *
+ * Pages are taken starting at index *@cur_pg; the first page is used from
+ * byte @pgbase, all following pages from byte 0.  The bio is allocated on
+ * first use.  On success *@cur_pg is advanced past the pages consumed and
+ * 0 is returned.  -ENOMEM is returned when the bio cannot be allocated or
+ * a page cannot be added in full; in that case *@cur_pg is not updated,
+ * but per_dev->length has already been increased by @cur_len.
+ */
+static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
+               unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+               gfp_t gfp_flags)
+{
+       unsigned pg = *cur_pg;
+       struct request_queue *q =
+                       osd_request_queue(_io_od(ios, per_dev->dev));
+
+       per_dev->length += cur_len;
+
+       if (per_dev->bio == NULL) {
+               /* size the bio from this device's share of the IO's pages */
+               unsigned stripes = ios->layout->num_comps /
+                                                    ios->layout->mirrors_p1;
+               unsigned pages_in_stripe = stripes *
+                                     (ios->layout->stripe_unit / PAGE_SIZE);
+               unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+                                   stripes;
+
+               /* never ask for more vectors than BIO_MAX_PAGES_KMALLOC */
+               if (BIO_MAX_PAGES_KMALLOC < bio_size)
+                       bio_size = BIO_MAX_PAGES_KMALLOC;
+
+               per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
+               if (unlikely(!per_dev->bio)) {
+                       dprintk("Faild to allocate BIO size=%u\n", bio_size);
+                       return -ENOMEM;
+               }
+       }
+
+       while (cur_len > 0) {
+               unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+               unsigned added_len;
+
+               BUG_ON(ios->ol_state.nr_pages <= pg);
+               cur_len -= pglen;
+
+               added_len = bio_add_pc_page(q, per_dev->bio,
+                                       ios->ol_state.pages[pg], pglen, pgbase);
+               if (unlikely(pglen != added_len))
+                       return -ENOMEM;
+               pgbase = 0; /* only the first page starts at an offset */
+               ++pg;
+       }
+       BUG_ON(cur_len);
+
+       *cur_pg = pg;
+       return 0;
+}
+
+/*
+ * Spread @length bytes, starting at the position described by @si, over
+ * the devices of one group.
+ *
+ * Devices are visited round-robin in steps of mirrors_p1, wrapping inside
+ * the group.  The first time a device is touched its object offset is
+ * derived from @si, depending on whether it lies before, at or after the
+ * starting device.  Pages are consumed from index *@last_pg, which is
+ * updated on return.  ios->length grows by every chunk that was queued and
+ * ios->numdevs is raised to cover the highest device used.  Returns 0 or
+ * the error from _add_stripe_unit().
+ */
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+                             struct _striping_info *si, unsigned *last_pg,
+                             gfp_t gfp_flags)
+{
+       unsigned stripe_unit = ios->layout->stripe_unit;
+       unsigned mirrors_p1 = ios->layout->mirrors_p1;
+       unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+       unsigned dev = si->dev;
+       unsigned first_dev = dev - (dev % devs_in_group);
+       unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+       unsigned cur_pg = *last_pg;
+       int ret = 0;
+
+       while (length) {
+               struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+               unsigned cur_len, page_off = 0;
+
+               if (!per_dev->length) {
+                       /* first chunk for this device: fix its start offset */
+                       per_dev->dev = dev;
+                       if (dev < si->dev) {
+                               per_dev->offset = si->obj_offset + stripe_unit -
+                                                                  si->unit_off;
+                               cur_len = stripe_unit;
+                       } else if (dev == si->dev) {
+                               per_dev->offset = si->obj_offset;
+                               cur_len = stripe_unit - si->unit_off;
+                               page_off = si->unit_off & ~PAGE_MASK;
+                               BUG_ON(page_off &&
+                                     (page_off != ios->ol_state.pgbase));
+                       } else { /* dev > si->dev */
+                               per_dev->offset = si->obj_offset - si->unit_off;
+                               cur_len = stripe_unit;
+                       }
+
+                       if (max_comp < dev)
+                               max_comp = dev;
+               } else {
+                       cur_len = stripe_unit;
+               }
+               if (cur_len >= length)
+                       cur_len = length;
+
+               ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+                                      cur_len, gfp_flags);
+               if (unlikely(ret))
+                       goto out;
+
+               /* next device of the group, wrapping back to its first one */
+               dev += mirrors_p1;
+               dev = (dev % devs_in_group) + first_dev;
+
+               length -= cur_len;
+               ios->length += cur_len;
+       }
+out:
+       ios->numdevs = max_comp + mirrors_p1;
+       *last_pg = cur_pg;
+       return ret;
+}
+
+/*
+ * Walk the byte range described by ios->ol_state one stripe group at a
+ * time, handing each piece to _prepare_one_group().
+ *
+ * Returns 0 as soon as some I/O was prepared (ios->length != 0), even if
+ * a later group failed; otherwise returns the status of the last
+ * _prepare_one_group() call (0 for an empty request).
+ */
+static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
+{
+       struct _striping_info si;
+       u64 remaining = ios->ol_state.count;
+       u64 pos = ios->ol_state.offset;
+       unsigned last_pg = 0;
+       int err = 0;
+
+       for (; remaining; pos += si.group_length, remaining -= si.group_length) {
+               _calc_stripe_info(ios, pos, &si);
+
+               /* never prepare more than what is left of the request */
+               if (si.group_length > remaining)
+                       si.group_length = remaining;
+
+               err = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
+               if (unlikely(err))
+                       break;
+       }
+
+       /* an error only counts when nothing at all was prepared */
+       if (ios->length)
+               return 0;
+
+       return err;
+}
+
+/*
+ * ->done callback installed by _io_exec() in sync mode: signal the
+ * completion that was stashed in ios->private.  Always returns 0.
+ */
+static ssize_t _sync_done(struct objio_state *ios)
+{
+       complete(ios->private);
+
+       return 0;
+}
+
+/* kref release function: runs the ->done callback of the owning io state. */
+static void _last_io(struct kref *kref)
+{
+       struct objio_state *ios;
+
+       ios = container_of(kref, struct objio_state, kref);
+       ios->done(ios);
+}
+
+/*
+ * Per-request completion callback handed to osd_execute_request_async().
+ * @p is the objio_state passed at submission; drop the reference that was
+ * taken for this request.
+ */
+static void _done_io(struct osd_request *or, void *p)
+{
+       struct objio_state *io_state = p;
+
+       kref_put(&io_state->kref, _last_io);
+}
+
+/*
+ * Submit every prepared osd request of @ios and arrange for ios->done to
+ * be called once all of them have completed.
+ *
+ * In sync mode (ios->ol_state.sync) the caller's ->done is replaced by
+ * _sync_done(), this function waits for completion and then calls the
+ * saved ->done itself, returning its status.  In async mode it returns 0
+ * right after submission.
+ */
+static ssize_t _io_exec(struct objio_state *ios)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+       ssize_t status = 0; /* sync status */
+       unsigned i;
+       objio_done_fn saved_done_fn = ios->done;
+       bool sync = ios->ol_state.sync;
+
+       if (sync) {
+               /* divert completion to the on-stack waiter */
+               ios->done = _sync_done;
+               ios->private = &wait;
+       }
+
+       /* submission-time reference, dropped after the loop below */
+       kref_init(&ios->kref);
+
+       for (i = 0; i < ios->numdevs; i++) {
+               struct osd_request *or = ios->per_dev[i].or;
+
+               /* slots without a prepared request are skipped */
+               if (!or)
+                       continue;
+
+               /* one reference per in-flight request, put by _done_io() */
+               kref_get(&ios->kref);
+               osd_execute_request_async(or, _done_io, ios);
+       }
+
+       /* drop the submission reference; _last_io() runs on the final put */
+       kref_put(&ios->kref, _last_io);
+
+       if (sync) {
+               wait_for_completion(&wait);
+               status = saved_done_fn(ios);
+       }
+
+       return status;
+}
+
+/*
+ * read
+ */
+
+/*
+ * ->done callback for reads: check the io results, release the io
+ * resources and report either the number of bytes read or the error to
+ * objlayout_read_done().  Returns that same status.
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+       int err = _io_check(ios, false);
+       ssize_t status;
+
+       _io_free(ios);
+
+       if (unlikely(err))
+               status = err;
+       else
+               status = ios->length;
+
+       objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+       return status;
+}
+
+/*
+ * Build the osd read request for component slot @cur_comp.
+ *
+ * The request is stored in the slot's ->or as soon as it is allocated, so
+ * the caller can release it on failure.  Returns 0 on success, -ENOMEM if
+ * the request could not be started, or the osd_finalize_request() error.
+ */
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+       struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+       unsigned dev = per_dev->dev;
+       struct pnfs_osd_object_cred *cred = &ios->layout->comps[dev];
+       struct osd_obj_id obj = {
+               .partition = cred->oc_object_id.oid_partition_id,
+               .id = cred->oc_object_id.oid_object_id,
+       };
+       struct osd_request *or;
+       int ret;
+
+       or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+       if (unlikely(!or))
+               return -ENOMEM;
+
+       per_dev->or = or;
+
+       osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+
+       ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+       if (ret) {
+               dprintk("%s: Faild to osd_finalize_request() => %d\n",
+                       __func__, ret);
+               return ret;
+       }
+
+       dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+               __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+               per_dev->length);
+
+       return 0;
+}
+
+/*
+ * Prepare a read request for the first slot of every mirror group that
+ * has data, then execute them all via _io_exec().  On a preparation
+ * failure the io resources are released and the error is returned.
+ */
+static ssize_t _read_exec(struct objio_state *ios)
+{
+       unsigned comp;
+       int err;
+
+       for (comp = 0; comp < ios->numdevs; comp += ios->layout->mirrors_p1) {
+               if (ios->per_dev[comp].length) {
+                       err = _read_mirrors(ios, comp);
+                       if (unlikely(err)) {
+                               _io_free(ios);
+                               return err;
+                       }
+               }
+       }
+
+       ios->done = _read_done;
+       return _io_exec(ios); /* In sync mode exec returns the io status */
+}
+
+/*
+ * Read entry point of the io engine: map the request onto the stripe
+ * layout and, if that succeeded, build and execute the osd reads.
+ */
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+       struct objio_state *ios;
+       int err;
+
+       ios = container_of(ol_state, struct objio_state, ol_state);
+
+       err = _io_rw_pagelist(ios, GFP_KERNEL);
+       if (likely(!err))
+               return _read_exec(ios);
+
+       return err;
+}
+
+/*
+ * write
+ */
+
+/*
+ * ->done callback for writes: check the io results, release the io
+ * resources and report either the number of bytes written or the error
+ * to objlayout_write_done().  Returns that same status.
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+       int err = _io_check(ios, true);
+       ssize_t status;
+
+       _io_free(ios);
+
+       if (unlikely(err)) {
+               status = err;
+       } else {
+               /* FIXME: should be based on the OSD's persistence model
+                * See OSD2r05 Section 4.13 Data persistence model */
+               ios->ol_state.committed = NFS_FILE_SYNC;
+               status = ios->length;
+       }
+
+       objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+       return status;
+}
+
+/*
+ * Build one osd write request per mirror of the group starting at slot
+ * @cur_comp.
+ *
+ * The first slot (the "master") writes from its own bio; every further
+ * mirror gets a freshly allocated clone of the master bio and inherits
+ * the master's offset and length.  Each request is stored in its slot's
+ * ->or as soon as it is allocated.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the
+ * osd_finalize_request() error.
+ * NOTE(review): ret is not initialized before the loop; it is only
+ * defined if the loop body runs at least once (mirrors_p1 != 0).
+ */
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+       struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+       unsigned dev = ios->per_dev[cur_comp].dev;
+       unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+       int ret;
+
+       /* slot index and device index advance together across the mirrors */
+       for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+               struct osd_request *or = NULL;
+               struct pnfs_osd_object_cred *cred =
+                                       &ios->layout->comps[dev];
+               struct osd_obj_id obj = {
+                       .partition = cred->oc_object_id.oid_partition_id,
+                       .id = cred->oc_object_id.oid_object_id,
+               };
+               struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+               struct bio *bio;
+
+               or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
+               if (unlikely(!or)) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+               per_dev->or = or;
+
+               if (per_dev != master_dev) {
+                       /* secondary mirror: write from a clone of the master bio */
+                       bio = bio_kmalloc(GFP_NOFS,
+                                         master_dev->bio->bi_max_vecs);
+                       if (unlikely(!bio)) {
+                               dprintk("Faild to allocate BIO size=%u\n",
+                                       master_dev->bio->bi_max_vecs);
+                               ret = -ENOMEM;
+                               goto err;
+                       }
+
+                       __bio_clone(bio, master_dev->bio);
+                       bio->bi_bdev = NULL;
+                       bio->bi_next = NULL;
+                       per_dev->bio = bio;
+                       per_dev->dev = dev;
+                       per_dev->length = master_dev->length;
+                       per_dev->offset =  master_dev->offset;
+               } else {
+                       /* master slot: mark its own bio as a write */
+                       bio = master_dev->bio;
+                       bio->bi_rw |= REQ_WRITE;
+               }
+
+               osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+
+               ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+               if (ret) {
+                       dprintk("%s: Faild to osd_finalize_request() => %d\n",
+                               __func__, ret);
+                       goto err;
+               }
+
+               dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+                       __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+                       per_dev->length);
+       }
+
+       /* reached on success as well; ret is then 0 from the last finalize */
+err:
+       return ret;
+}
+
+/*
+ * Prepare the write requests of every mirror group that has data, then
+ * execute them all via _io_exec().  On a preparation failure the io
+ * resources are released and the error is returned.
+ */
+static ssize_t _write_exec(struct objio_state *ios)
+{
+       unsigned comp;
+       int err;
+
+       for (comp = 0; comp < ios->numdevs; comp += ios->layout->mirrors_p1) {
+               if (ios->per_dev[comp].length) {
+                       err = _write_mirrors(ios, comp);
+                       if (unlikely(err)) {
+                               _io_free(ios);
+                               return err;
+                       }
+               }
+       }
+
+       ios->done = _write_done;
+       return _io_exec(ios); /* In sync mode exec returns the io->status */
+}
+
+/*
+ * Write entry point of the io engine: map the request onto the stripe
+ * layout and, if that succeeded, build and execute the osd writes.
+ * @stable is currently not used (see TODO below).
+ */
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+       struct objio_state *ios;
+       int err;
+
+       ios = container_of(ol_state, struct objio_state, ol_state);
+
+       /* TODO: ios->stable = stable; */
+       err = _io_rw_pagelist(ios, GFP_NOFS);
+       if (likely(!err))
+               return _write_exec(ios);
+
+       return err;
+}
+
+/*
+ * ->pg_test: @req may be coalesced only if the generic pNFS test accepts
+ * it and the accumulated byte count stays within the layout segment's
+ * max_io_size.
+ */
+static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+                         struct nfs_page *prev, struct nfs_page *req)
+{
+       return pnfs_generic_pg_test(pgio, prev, req) &&
+              pgio->pg_count + req->wb_bytes <=
+                       OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+}
+
+/*
+ * Layout driver descriptor registered with the pNFS core by
+ * objlayout_init() and unregistered by objlayout_exit().
+ */
+static struct pnfs_layoutdriver_type objlayout_type = {
+       .id = LAYOUT_OSD2_OBJECTS,
+       .name = "LAYOUT_OSD2_OBJECTS",
+       .flags                   = PNFS_LAYOUTRET_ON_SETATTR,
+
+       /* per-inode layout header */
+       .alloc_layout_hdr        = objlayout_alloc_layout_hdr,
+       .free_layout_hdr         = objlayout_free_layout_hdr,
+
+       /* layout segments */
+       .alloc_lseg              = objlayout_alloc_lseg,
+       .free_lseg               = objlayout_free_lseg,
+
+       /* I/O paths and request coalescing test */
+       .read_pagelist           = objlayout_read_pagelist,
+       .write_pagelist          = objlayout_write_pagelist,
+       .pg_test                 = objio_pg_test,
+
+       .free_deviceid_node      = objio_free_deviceid_node,
+
+       /* layout-type specific XDR bodies */
+       .encode_layoutcommit     = objlayout_encode_layoutcommit,
+       .encode_layoutreturn     = objlayout_encode_layoutreturn,
+};
+
+/* Module metadata */
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+/*
+ * Module entry point: register the objects layout driver with the pNFS
+ * core and log the outcome.  Returns pnfs_register_layoutdriver()'s result.
+ */
+static int __init
+objlayout_init(void)
+{
+       int err;
+
+       err = pnfs_register_layoutdriver(&objlayout_type);
+       if (!err) {
+               printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+                       __func__);
+               return 0;
+       }
+
+       printk(KERN_INFO
+               "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+               __func__, err);
+       return err;
+}
+
+/* Module exit point: unregister the layout driver and log it. */
+static void __exit
+objlayout_exit(void)
+{
+       pnfs_unregister_layoutdriver(&objlayout_type);
+
+       printk(KERN_INFO
+              "%s: Unregistered OSD pNFS Layout Driver\n", __func__);
+}
+
+/* Hook the registration/unregistration functions into module load/unload */
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644 (file)
index 0000000..dc3956c
--- /dev/null
@@ -0,0 +1,712 @@
+/*
+ *  pNFS Objects layout driver high level definitions
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Create an objlayout layout structure for the given inode and return it.
+ *
+ * Allocates a zeroed struct objlayout with @gfp_flags and initializes its
+ * lock and error list.  Returns the embedded pnfs_layout_hdr, or NULL if
+ * the allocation failed.  @inode is not used here.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+       struct objlayout *objlay;
+
+       objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
+       dprintk("%s: Return %p\n", __func__, objlay);
+
+       /*
+        * Do not derive a member address from a NULL pointer: that only
+        * yields NULL if pnfs_layout happens to be the first member.
+        */
+       if (!objlay)
+               return NULL;
+
+       spin_lock_init(&objlay->lock);
+       INIT_LIST_HEAD(&objlay->err_list);
+
+       return &objlay->pnfs_layout;
+}
+
+/*
+ * Release an objlayout layout structure.
+ *
+ * Warns if I/O errors are still queued on the layout's err_list, then
+ * frees the containing struct objlayout.
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+       struct objlayout *layout = OBJLAYOUT(lo);
+
+       dprintk("%s: objlay %p\n", __func__, layout);
+
+       WARN_ON(!list_empty(&layout->err_list));
+
+       kfree(layout);
+}
+
+/*
+ * Unmarshall layout and store it in pnfslay.
+ *
+ * Sets up an xdr decode stream over the LAYOUTGET result pages (with a
+ * temporary scratch page) and lets objio_alloc_lseg() build the segment.
+ * Returns the new segment, or an ERR_PTR() on failure.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+                    struct nfs4_layoutget_res *lgr,
+                    gfp_t gfp_flags)
+{
+       struct xdr_buf buf = {
+               .pages =  lgr->layoutp->pages,
+               .page_len =  lgr->layoutp->len,
+               .buflen =  lgr->layoutp->len,
+               .len = lgr->layoutp->len,
+       };
+       struct pnfs_layout_segment *lseg;
+       struct xdr_stream stream;
+       struct page *scratch;
+       int status;
+
+       dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+
+       scratch = alloc_page(gfp_flags);
+       if (!scratch) {
+               status = -ENOMEM;
+               goto fail;
+       }
+
+       xdr_init_decode(&stream, &buf, NULL);
+       xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+       status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
+       if (unlikely(status)) {
+               dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
+                       status);
+               __free_page(scratch);
+               goto fail;
+       }
+
+       /* the scratch page is only needed while decoding */
+       __free_page(scratch);
+
+       dprintk("%s: Return %p\n", __func__, lseg);
+       return lseg;
+
+fail:
+       dprintk("%s: Err Return=>%d\n", __func__, status);
+       return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segment.  A NULL @lseg is tolerated and ignored.
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+       dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+       if (likely(lseg))
+               objio_free_lseg(lseg);
+}
+
+/*
+ * I/O Operations
+ */
+/*
+ * First offset past the range [start, start + len), saturating to
+ * NFS4_MAX_UINT64 when start + len wraps around.
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+       u64 end = start + len;
+
+       if (end < start)
+               return NFS4_MAX_UINT64;
+
+       return end;
+}
+
+/*
+ * last octet in a range; saturates to NFS4_MAX_UINT64 when start + len
+ * wraps around.  @len must not be zero.
+ */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+       u64 end;
+
+       BUG_ON(!len);
+
+       end = start + len;
+       if (end <= start)
+               return NFS4_MAX_UINT64;
+
+       return end - 1;
+}
+
+/*
+ * Allocate and initialize an io state for one read or write request.
+ *
+ * @pnfs_layout_type: layout header of the inode (not used here)
+ * @pages, @pgbase:   page array and byte offset of the data within it
+ * @offset, @count:   file range of the request; @count is truncated so
+ *                    that the range does not extend past the end of @lseg
+ * @lseg:             layout segment the I/O goes through; @offset must lie
+ *                    inside it (enforced with BUG_ON)
+ * @rpcdata:          nfs_read_data/nfs_write_data of the caller
+ *
+ * The state starts out in async mode (->sync = 0).  Returns NULL if
+ * objio_alloc_io_state() fails.
+ */
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+                       struct page **pages,
+                       unsigned pgbase,
+                       loff_t offset,
+                       size_t count,
+                       struct pnfs_layout_segment *lseg,
+                       void *rpcdata,
+                       gfp_t gfp_flags)
+{
+       struct objlayout_io_state *state;
+       u64 lseg_end_offset;
+
+       dprintk("%s: allocating io_state\n", __func__);
+       if (objio_alloc_io_state(lseg, &state, gfp_flags))
+               return NULL;
+
+       BUG_ON(offset < lseg->pls_range.offset);
+       lseg_end_offset = end_offset(lseg->pls_range.offset,
+                                    lseg->pls_range.length);
+       BUG_ON(offset >= lseg_end_offset);
+       if (offset + count > lseg_end_offset) {
+               count = lseg->pls_range.length -
+                               (offset - lseg->pls_range.offset);
+               dprintk("%s: truncated count %Zd\n", __func__, count);
+       }
+
+       /*
+        * Normalize so that pgbase is an offset inside the first page.
+        * This must include pgbase == PAGE_SIZE, which is offset 0 of the
+        * next page, not an offset inside the current one.
+        */
+       if (pgbase >= PAGE_SIZE) {
+               pages += pgbase >> PAGE_SHIFT;
+               pgbase &= ~PAGE_MASK;
+       }
+
+       INIT_LIST_HEAD(&state->err_list);
+       state->lseg = lseg;
+       state->rpcdata = rpcdata;
+       state->pages = pages;
+       state->pgbase = pgbase;
+       /* pages spanned by [pgbase, pgbase + count), rounded up */
+       state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       state->offset = offset;
+       state->count = count;
+       state->sync = 0;
+
+       return state;
+}
+
+/* Release an io state; a NULL @state is tolerated and ignored. */
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+       dprintk("%s: freeing io_state\n", __func__);
+
+       if (likely(state))
+               objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ *
+ * A successful I/O (state->status >= 0) simply releases its state.  A
+ * failed one marks the layout's space accounting invalid and queues the
+ * state on the layout's err_list, where it stays until it is reported
+ * and freed at layout-return time.
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+       dprintk("%s: state %p status\n", __func__, state);
+
+       if (likely(state->status >= 0)) {
+               objlayout_free_io_state(state);
+       } else {
+               struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+
+               spin_lock(&objlay->lock);
+               objlay->delta_space_valid = OBJ_DSU_INVALID;
+               /*
+                * list_add(new, head): the state is the new entry and the
+                * layout's err_list is the head.  With the arguments the
+                * other way round, a second failed I/O re-links the head
+                * into its own list and orphans every state queued before.
+                */
+               list_add(&state->err_list, &objlay->err_list);
+               spin_unlock(&objlay->lock);
+       }
+}
+
+/*
+ * objlayout_io_set_result - Record the osd result of one component.
+ *
+ * Stores @osd_error together with the object id, range and direction in
+ * slot @index of state->ioerrs so it can be reported at layout-return.
+ * A zero @osd_error just clears the slot's errno.  @index must be below
+ * state->num_comps.
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+                       struct pnfs_osd_objid *pooid, int osd_error,
+                       u64 offset, u64 length, bool is_write)
+{
+       struct pnfs_osd_ioerr *slot;
+
+       BUG_ON(index >= state->num_comps);
+       slot = &state->ioerrs[index];
+
+       if (!osd_error) {
+               /* User need not call if no error is reported */
+               slot->oer_errno = 0;
+               return;
+       }
+
+       slot->oer_component = *pooid;
+       slot->oer_comp_offset = offset;
+       slot->oer_comp_length = length;
+       slot->oer_iswrite = is_write;
+       slot->oer_errno = osd_error;
+
+       dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+               "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+               __func__, index, slot->oer_errno,
+               slot->oer_iswrite,
+               _DEVID_LO(&slot->oer_component.oid_device_id),
+               _DEVID_HI(&slot->oer_component.oid_device_id),
+               slot->oer_component.oid_partition_id,
+               slot->oer_component.oid_object_id,
+               slot->oer_comp_offset,
+               slot->oer_comp_length);
+}
+
+/*
+ * Work function that finishes an async read by calling
+ * pnfs_ld_read_done().  It is deferred to a workqueue because the osd
+ * completion is called with ints-off from the block layer.
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+       struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+       struct nfs_read_data *rdata = container_of(task, struct nfs_read_data,
+                                                  task);
+
+       dprintk("%s enter\n", __func__);
+
+       pnfs_ld_read_done(rdata);
+}
+
+/*
+ * Called by the io engine when a read has finished with @status (bytes
+ * read or a negative error).  Copies the result into the nfs_read_data,
+ * hands the io state to objlayout_iodone() and completes the read:
+ * directly when @sync, otherwise from a scheduled work item.
+ */
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+       struct nfs_read_data *rdata = state->rpcdata;
+       int eof = state->eof;
+
+       state->status = status;
+       dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+
+       rdata->task.tk_status = status;
+       if (status >= 0) {
+               rdata->res.count = status;
+               rdata->res.eof = eof;
+       }
+
+       objlayout_iodone(state);
+       /* must not use state after this point */
+
+       if (!sync) {
+               INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+               schedule_work(&rdata->task.u.tk_work);
+               return;
+       }
+
+       pnfs_ld_read_done(rdata);
+}
+
+/*
+ * Perform sync or async reads.
+ *
+ * ->read_pagelist entry point.  The request is clipped to the inode size,
+ * an io state is allocated and handed to objio_read_pagelist().  The
+ * outcome is reported through rdata->pnfs_error; the return value is
+ * always PNFS_ATTEMPTED.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+       loff_t offset = rdata->args.offset;
+       size_t count = rdata->args.count;
+       struct objlayout_io_state *state;
+       ssize_t status = 0;
+       loff_t eof;
+
+       dprintk("%s: Begin inode %p offset %llu count %d\n",
+               __func__, rdata->inode, offset, (int)count);
+
+       eof = i_size_read(rdata->inode);
+       if (unlikely(offset + count > eof)) {
+               if (offset >= eof) {
+                       /* entirely past EOF: zero bytes, eof set, no I/O.
+                        * NOTE(review): no read-done callback is invoked on
+                        * this path -- confirm the caller copes with that.
+                        */
+                       status = 0;
+                       rdata->res.count = 0;
+                       rdata->res.eof = 1;
+                       goto out;
+               }
+               /* crosses EOF: read only up to the end of the file */
+               count = eof - offset;
+       }
+
+       state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+                                        rdata->args.pages, rdata->args.pgbase,
+                                        offset, count,
+                                        rdata->lseg, rdata,
+                                        GFP_KERNEL);
+       if (unlikely(!state)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       /* the (possibly truncated) request reaches the end of the file */
+       state->eof = state->offset + state->count >= eof;
+
+       status = objio_read_pagelist(state);
+ out:
+       dprintk("%s: Return status %Zd\n", __func__, status);
+       rdata->pnfs_error = status;
+       return PNFS_ATTEMPTED;
+}
+
+/*
+ * Work function that finishes an async write by calling
+ * pnfs_ld_write_done().  It is deferred to a workqueue because the osd
+ * completion is called with ints-off from the block layer.
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+       struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+       struct nfs_write_data *wdata = container_of(task, struct nfs_write_data,
+                                                   task);
+
+       dprintk("%s enter\n", __func__);
+
+       pnfs_ld_write_done(wdata);
+}
+
+/*
+ * Called by the io engine when a write has finished with @status (bytes
+ * written or a negative error).  Copies the result into the
+ * nfs_write_data, hands the io state to objlayout_iodone() and completes
+ * the write: directly when @sync, otherwise from a scheduled work item.
+ */
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+                    bool sync)
+{
+       struct nfs_write_data *wdata;
+
+       dprintk("%s: Begin\n", __func__);
+
+       wdata = state->rpcdata;
+       state->status = status;
+       wdata->task.tk_status = status;
+
+       if (status < 0) {
+               dprintk("%s: Return status %d\n",
+                       __func__, wdata->task.tk_status);
+       } else {
+               wdata->res.count = status;
+               wdata->verf.committed = state->committed;
+               dprintk("%s: Return status %d committed %d\n",
+                       __func__, wdata->task.tk_status,
+                       wdata->verf.committed);
+       }
+
+       objlayout_iodone(state);
+       /* must not use state after this point */
+
+       if (!sync) {
+               INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+               schedule_work(&wdata->task.u.tk_work);
+               return;
+       }
+
+       pnfs_ld_write_done(wdata);
+}
+
+/*
+ * Perform sync or async writes.
+ *
+ * ->write_pagelist entry point.  Allocates an io state for the request,
+ * derives the sync mode from @how (FLUSH_SYNC) and passes the request to
+ * objio_write_pagelist().  The outcome is reported through
+ * wdata->pnfs_error; the return value is always PNFS_ATTEMPTED.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+                        int how)
+{
+       struct objlayout_io_state *io;
+       ssize_t status;
+
+       dprintk("%s: Begin inode %p offset %llu count %u\n",
+               __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+       io = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+                                     wdata->args.pages,
+                                     wdata->args.pgbase,
+                                     wdata->args.offset,
+                                     wdata->args.count,
+                                     wdata->lseg, wdata,
+                                     GFP_NOFS);
+       if (likely(io)) {
+               io->sync = how & FLUSH_SYNC;
+               status = objio_write_pagelist(io, how & FLUSH_STABLE);
+       } else {
+               status = -ENOMEM;
+       }
+
+       dprintk("%s: Return status %Zd\n", __func__, status);
+       wdata->pnfs_error = status;
+       return PNFS_ATTEMPTED;
+}
+
+/*
+ * ->encode_layoutcommit: emit the objects-layout specific layoutupdate
+ * body into @xdr, preceded by its length in bytes.
+ *
+ * Snapshots the layout's accumulated delta-space-used under the layout
+ * lock and resets it, and flags whether I/O errors are currently queued
+ * on the layout.  @args is not used here.
+ */
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+                             struct xdr_stream *xdr,
+                             const struct nfs4_layoutcommit_args *args)
+{
+       struct objlayout *objlay = OBJLAYOUT(pnfslay);
+       struct pnfs_osd_layoutupdate lou;
+       __be32 *start;
+
+       dprintk("%s: Begin\n", __func__);
+
+       spin_lock(&objlay->lock);
+       lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+       lou.dsu_delta = objlay->delta_space_used;
+       objlay->delta_space_used = 0;
+       objlay->delta_space_valid = OBJ_DSU_INIT;
+       lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+       spin_unlock(&objlay->lock);
+
+       /* length word, patched below once the body size is known */
+       start = xdr_reserve_space(xdr, 4);
+       /* same check as objlayout_encode_layoutreturn(): start is written below */
+       BUG_ON(!start);
+
+       BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+       /* bytes encoded after the length word itself */
+       *start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+       dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+               lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
+/*
+ * Map a PNFS_OSD_ERR_* code to its OSD_ERR_PRI_* priority, used when
+ * merging several errors into one.  0 (no error) maps to 0; an unknown
+ * code triggers a warning and is treated like PNFS_OSD_ERR_EIO.
+ */
+static int
+err_prio(u32 oer_errno)
+{
+       switch (oer_errno) {
+       case 0:
+               return 0;
+       case PNFS_OSD_ERR_RESOURCE:
+               return OSD_ERR_PRI_RESOURCE;
+       case PNFS_OSD_ERR_BAD_CRED:
+               return OSD_ERR_PRI_BAD_CRED;
+       case PNFS_OSD_ERR_NO_ACCESS:
+               return OSD_ERR_PRI_NO_ACCESS;
+       case PNFS_OSD_ERR_UNREACHABLE:
+               return OSD_ERR_PRI_UNREACHABLE;
+       case PNFS_OSD_ERR_NOT_FOUND:
+               return OSD_ERR_PRI_NOT_FOUND;
+       case PNFS_OSD_ERR_NO_SPACE:
+               return OSD_ERR_PRI_NO_SPACE;
+       case PNFS_OSD_ERR_EIO:
+               return OSD_ERR_PRI_EIO;
+       default:
+               WARN_ON(1);
+               return OSD_ERR_PRI_EIO;
+       }
+}
+
+/*
+ * Fold @src_err into the accumulated error @dest_err.
+ *
+ * The first error is copied as is, with the device id blanked.  For
+ * every further error: partition/object ids that differ are zeroed, the
+ * reported range becomes the union of both ranges, and the errno/iswrite
+ * pair is updated as described at the bottom of the function.
+ */
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+           const struct pnfs_osd_ioerr *src_err)
+{
+       u64 dest_end, src_end;
+
+       /* nothing accumulated yet: start from a copy of the source */
+       if (!dest_err->oer_errno) {
+               *dest_err = *src_err;
+               /* accumulated device must be blank */
+               memset(&dest_err->oer_component.oid_device_id, 0,
+                       sizeof(dest_err->oer_component.oid_device_id));
+
+               return;
+       }
+
+       /* ids are only kept while all merged errors agree on them */
+       if (dest_err->oer_component.oid_partition_id !=
+                               src_err->oer_component.oid_partition_id)
+               dest_err->oer_component.oid_partition_id = 0;
+
+       if (dest_err->oer_component.oid_object_id !=
+                               src_err->oer_component.oid_object_id)
+               dest_err->oer_component.oid_object_id = 0;
+
+       /* extend the range downwards ... */
+       if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+               dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+       /* NOTE(review): dest_end is computed after the offset above may
+        * already have been lowered while the length is still the old one;
+        * confirm this cannot shorten the accumulated range.
+        */
+       dest_end = end_offset(dest_err->oer_comp_offset,
+                             dest_err->oer_comp_length);
+       src_end =  end_offset(src_err->oer_comp_offset,
+                             src_err->oer_comp_length);
+       /* ... and upwards */
+       if (dest_end < src_end)
+               dest_end = src_end;
+
+       dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+       /*
+        * Same direction and higher priority: take the source errno.
+        * Otherwise, if the source is a write error, it replaces the
+        * accumulated errno and marks the result as a write error.
+        */
+       if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
+           (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
+                       dest_err->oer_errno = src_err->oer_errno;
+       } else if (src_err->oer_iswrite) {
+               dest_err->oer_iswrite = true;
+               dest_err->oer_errno = src_err->oer_errno;
+       }
+}
+
+/*
+ * Merge every error still queued on objlay->err_list into a single
+ * descriptor, log each one, unlink and free the io states, and encode
+ * the merged descriptor at @p.
+ */
+static void
+encode_accumulated_error(struct objlayout *objlay, __be32 *p)
+{
+       struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+       struct objlayout_io_state *cur, *next;
+
+       list_for_each_entry_safe(cur, next, &objlay->err_list, err_list) {
+               unsigned comp;
+
+               for (comp = 0; comp < cur->num_comps; comp++) {
+                       struct pnfs_osd_ioerr *ioerr = &cur->ioerrs[comp];
+
+                       if (ioerr->oer_errno) {
+                               printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+                                       "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+                                       "offset=0x%llx length=0x%llx\n",
+                                       __func__, comp, ioerr->oer_errno,
+                                       ioerr->oer_iswrite,
+                                       _DEVID_LO(&ioerr->oer_component.oid_device_id),
+                                       _DEVID_HI(&ioerr->oer_component.oid_device_id),
+                                       ioerr->oer_component.oid_partition_id,
+                                       ioerr->oer_component.oid_object_id,
+                                       ioerr->oer_comp_offset,
+                                       ioerr->oer_comp_length);
+
+                               merge_ioerr(&accumulated_err, ioerr);
+                       }
+               }
+
+               /* this state is fully accounted for */
+               list_del(&cur->err_list);
+               objlayout_free_io_state(cur);
+       }
+
+       pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
+}
+
+/*
+ * ->encode_layoutreturn: emit the queued I/O error descriptors of the
+ * layout into @xdr, preceded by their total length in bytes.
+ *
+ * Every io state on objlay->err_list is encoded, unlinked and freed.  If
+ * the stream runs out of space, the most recently reserved slot is
+ * overwritten with the merge of all errors still on the list.  @args is
+ * not used here.
+ */
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+                             struct xdr_stream *xdr,
+                             const struct nfs4_layoutreturn_args *args)
+{
+       struct objlayout *objlay = OBJLAYOUT(pnfslay);
+       struct objlayout_io_state *state, *tmp;
+       __be32 *start;
+
+       dprintk("%s: Begin\n", __func__);
+       /* length word, patched at the end */
+       start = xdr_reserve_space(xdr, 4);
+       BUG_ON(!start);
+
+       spin_lock(&objlay->lock);
+
+       list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+               /* NOTE(review): last_xdr is reset for every state, so running
+                * out of space on the first error of a later state hits the
+                * BUG_ON below even though earlier slots exist -- confirm.
+                */
+               __be32 *last_xdr = NULL, *p;
+               unsigned i;
+               int res = 0;
+
+               for (i = 0; i < state->num_comps; i++) {
+                       struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+                       /* components without an error are not reported */
+                       if (!ioerr->oer_errno)
+                               continue;
+
+                       dprintk("%s: err[%d]: errno=%d is_write=%d "
+                               "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+                               "offset=0x%llx length=0x%llx\n",
+                               __func__, i, ioerr->oer_errno,
+                               ioerr->oer_iswrite,
+                               _DEVID_LO(&ioerr->oer_component.oid_device_id),
+                               _DEVID_HI(&ioerr->oer_component.oid_device_id),
+                               ioerr->oer_component.oid_partition_id,
+                               ioerr->oer_component.oid_object_id,
+                               ioerr->oer_comp_offset,
+                               ioerr->oer_comp_length);
+
+                       p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
+                       if (unlikely(!p)) {
+                               res = -E2BIG;
+                               break; /* accumulated_error */
+                       }
+
+                       last_xdr = p;
+                       pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+               }
+
+               /* TODO: use xdr_write_pages */
+               if (unlikely(res)) {
+                       /* no space for even one error descriptor */
+                       BUG_ON(!last_xdr);
+
+                       /* we've encountered a situation with lots and lots of
+                        * errors and no space to encode them all. Use the last
+                        * available slot to report the union of all the
+                        * remaining errors.
+                        */
+                       encode_accumulated_error(objlay, last_xdr);
+                       /* the list has been drained, including this state */
+                       goto loop_done;
+               }
+               list_del(&state->err_list);
+               objlayout_free_io_state(state);
+       }
+loop_done:
+       spin_unlock(&objlay->lock);
+
+       /* bytes encoded after the length word itself */
+       *start = cpu_to_be32((xdr->p - start - 1) * 4);
+       dprintk("%s: Return\n", __func__);
+}
+
+
+/*
+ * Get Device Info API for io engines
+ */
+
+/*
+ * Wrapper handed out by objlayout_get_deviceinfo(): callers only see
+ * &da; objlayout_put_deviceinfo() gets back to the wrapper with
+ * container_of() to release both the page and the wrapper.
+ */
+struct objlayout_deviceinfo {
+       struct page *page; /* page holding the GETDEVICEINFO reply */
+       struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+
+/* Initialize and call nfs_getdeviceinfo, then decode and return a
+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
+ * should be called.
+ *
+ * @pnfslay:    layout header; its inode selects the server to query
+ * @d_id:       device id to look up
+ * @deviceaddr: on success, set to the decoded device address
+ * @gfp_flags:  allocation flags for the reply page and the wrapper
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * returned by nfs4_proc_getdeviceinfo().  On failure nothing is left
+ * allocated and *deviceaddr is not touched.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+       struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+       gfp_t gfp_flags)
+{
+       struct objlayout_deviceinfo *odi;
+       struct pnfs_device pd;
+       struct page *page;
+       u32 *p;
+       int err;
+
+       /* single page receiving the GETDEVICEINFO reply */
+       page = alloc_page(gfp_flags);
+       if (!page)
+               return -ENOMEM;
+
+       memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+       pd.layout_type = LAYOUT_OSD2_OBJECTS;
+       pd.pages = &page;
+       pd.pgbase = 0;
+       pd.pglen = PAGE_SIZE;
+       pd.mincount = 0;
+
+       err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+       dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+       if (err)
+               goto err_out;
+
+       p = page_address(page);
+       odi = kzalloc(sizeof(*odi), gfp_flags);
+       if (!odi) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+       pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
+       /* the page is kept until objlayout_put_deviceinfo() frees it */
+       odi->page = page;
+       *deviceaddr = &odi->da;
+       return 0;
+
+err_out:
+       __free_page(page);
+       return err;
+}
+
+/*
+ * Release a device address obtained from objlayout_get_deviceinfo():
+ * frees the reply page and the wrapper that embeds @deviceaddr.
+ */
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+       struct objlayout_deviceinfo *info;
+
+       info = container_of(deviceaddr, struct objlayout_deviceinfo, da);
+
+       __free_page(info->page);
+       kfree(info);
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644 (file)
index 0000000..a8244c8
--- /dev/null
@@ -0,0 +1,187 @@
+/*
+ *  Data types and function declarations for interfacing with the
+ *  pNFS standard object layout driver.
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * per-inode layout
+ *
+ * Wraps the generic pnfs_layout_hdr; OBJLAYOUT() converts a pointer to the
+ * embedded header back into a pointer to this structure.
+ */
+struct objlayout {
+       struct pnfs_layout_hdr pnfs_layout;
+
+        /* for layout_commit
+         * delta_space_used accumulates the values passed to
+         * objlayout_add_delta_space_used(); once delta_space_valid is
+         * OBJ_DSU_INVALID that helper stops updating either field.
+         */
+       enum osd_delta_space_valid_enum {
+               OBJ_DSU_INIT = 0,
+               OBJ_DSU_VALID,
+               OBJ_DSU_INVALID,
+       } delta_space_valid;
+       s64 delta_space_used;  /* consumed by write ops */
+
+        /* for layout_return
+         * lock is also taken by objlayout_add_delta_space_used() around
+         * its update of the delta_space_* fields above.
+         */
+       spinlock_t lock;
+       struct list_head err_list;
+};
+
+/* Map a generic layout header back to the objlayout that embeds it. */
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+       struct objlayout *objlay;
+
+       objlay = container_of(lo, struct objlayout, pnfs_layout);
+       return objlay;
+}
+
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ *
+ * lseg is the layout segment the I/O runs against;
+ * objlayout_add_delta_space_used() follows lseg->pls_layout to reach the
+ * per-inode objlayout.  The fields tagged "res" are results.
+ * NOTE(review): pages/pgbase/nr_pages/count/offset presumably describe the
+ * page array and byte range of the request -- confirm against the code
+ * that fills them in.
+ */
+struct objlayout_io_state {
+       struct pnfs_layout_segment *lseg;
+
+       struct page **pages;
+       unsigned pgbase;
+       unsigned nr_pages;
+       unsigned long count;
+       loff_t offset;
+       bool sync;
+
+       void *rpcdata;
+       int status;             /* res */
+       int eof;                /* res */
+       int committed;          /* res */
+
+       /* Error reporting (layout_return) */
+       struct list_head err_list;
+       unsigned num_comps;
+       /* Pointer to array of error descriptors of size num_comps.
+        * It should contain as many entries as devices in the osd_layout
+        * that participate in the I/O. It is up to the io_engine to allocate
+        * needed space and set num_comps.
+        */
+       struct pnfs_osd_ioerr *ioerrs;
+};
+
+/*
+ * Raid engine I/O API
+ */
+extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+       struct pnfs_layout_hdr *pnfslay,
+       struct pnfs_layout_range *range,
+       struct xdr_stream *xdr,
+       gfp_t gfp_flags);
+extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
+
+extern int objio_alloc_io_state(
+       struct pnfs_layout_segment *lseg,
+       struct objlayout_io_state **outp,
+       gfp_t gfp_flags);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+                                   bool stable);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+                       unsigned index, struct pnfs_osd_objid *pooid,
+                       int osd_error, u64 offset, u64 length, bool is_write);
+
+/*
+ * Fold the space consumed by one I/O into the per-inode delta-space-used
+ * counter, under the objlayout lock.
+ *
+ * The report is all-or-nothing: once delta_space_valid has been set to
+ * OBJ_DSU_INVALID neither field is touched again, because the protocol
+ * requires the DSU to be accurate or not reported at all.
+ */
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+       struct objlayout *ol = OBJLAYOUT(state->lseg->pls_layout);
+
+       spin_lock(&ol->lock);
+       switch (ol->delta_space_valid) {
+       case OBJ_DSU_INVALID:
+               /* an earlier I/O spoiled the report; leave it alone */
+               break;
+       default:
+               ol->delta_space_valid = OBJ_DSU_VALID;
+               ol->delta_space_used += space_used;
+               break;
+       }
+       spin_unlock(&ol->lock);
+}
+
+extern void objlayout_read_done(struct objlayout_io_state *state,
+                               ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+                                ssize_t status, bool sync);
+
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+       struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+       gfp_t gfp_flags);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
+/*
+ * exported generic objects function vectors
+ */
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+       struct pnfs_layout_hdr *,
+       struct nfs4_layoutget_res *,
+       gfp_t gfp_flags);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+extern enum pnfs_try_status objlayout_read_pagelist(
+       struct nfs_read_data *);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+       struct nfs_write_data *,
+       int how);
+
+extern void objlayout_encode_layoutcommit(
+       struct pnfs_layout_hdr *,
+       struct xdr_stream *,
+       const struct nfs4_layoutcommit_args *);
+
+extern void objlayout_encode_layoutreturn(
+       struct pnfs_layout_hdr *,
+       struct xdr_stream *,
+       const struct nfs4_layoutreturn_args *);
+
+#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644 (file)
index 0000000..16fc758
--- /dev/null
@@ -0,0 +1,412 @@
+/*
+ *  Object-Based pNFS Layout XDR layer
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on RFC5664
+ */
+
+/*
+ * struct pnfs_osd_objid {
+ *     struct nfs4_deviceid    oid_device_id;
+ *     u64                     oid_partition_id;
+ *     u64                     oid_object_id;
+ * }; // xdr size 32 bytes
+ *
+ * Decode one object id starting at @p and return the position just past it.
+ */
+static __be32 *
+_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+       __be32 *cur = p;
+
+       cur = xdr_decode_opaque_fixed(cur, objid->oid_device_id.data,
+                                     sizeof(objid->oid_device_id.data));
+       cur = xdr_decode_hyper(cur, &objid->oid_partition_id);
+
+       return xdr_decode_hyper(cur, &objid->oid_object_id);
+}
+/*
+ * struct pnfs_osd_opaque_cred {
+ *     u32 cred_len;
+ *     void *cred;
+ * }; // xdr size [variable]
+ * The return pointers are from the xdr buffer
+ *
+ * Decode one opaque credential from @xdr: a 4-byte length word followed by
+ * cred_len bytes of data.  On success opaque_cred->cred points into the xdr
+ * buffer (nothing is copied).  Returns 0, or -EINVAL when
+ * xdr_inline_decode() cannot supply the length word or the credential
+ * bytes.
+ */
+static int
+_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
+                           struct xdr_stream *xdr)
+{
+       /* The length is a full XDR word: request all 4 bytes read below. */
+       __be32 *p = xdr_inline_decode(xdr, 4);
+
+       if (!p)
+               return -EINVAL;
+
+       opaque_cred->cred_len = be32_to_cpup(p);
+
+       p = xdr_inline_decode(xdr, opaque_cred->cred_len);
+       if (!p)
+               return -EINVAL;
+
+       opaque_cred->cred = p;
+       return 0;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ *     struct pnfs_osd_objid           oc_object_id;
+ *     u32                             oc_osd_version;
+ *     u32                             oc_cap_key_sec;
+ *     struct pnfs_osd_opaque_cred     oc_cap_key
+ *     struct pnfs_osd_opaque_cred     oc_cap;
+ * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
+ *
+ * Decode one object credential from @xdr into @comp.  Returns 0 on
+ * success, -EIO if the fixed-size head is not available, or the error of
+ * _osd_xdr_decode_opaque_cred() for either credential.
+ */
+static int
+_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
+                           struct xdr_stream *xdr)
+{
+       __be32 *p;
+       int ret;
+
+       /* fixed part: object id, osd version, cap-key security type */
+       p = xdr_inline_decode(xdr, 32 + 4 + 4);
+       if (!p)
+               return -EIO;
+
+       p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+       comp->oc_osd_version = be32_to_cpup(p++);
+       comp->oc_cap_key_sec = be32_to_cpup(p);
+
+       /* variable part: the two opaque credentials, in wire order */
+       ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
+       if (unlikely(ret))
+               return ret;
+
+       return _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ *     u32     odm_num_comps;
+ *     u64     odm_stripe_unit;
+ *     u32     odm_group_width;
+ *     u32     odm_group_depth;
+ *     u32     odm_mirror_cnt;
+ *     u32     odm_raid_algorithm;
+ * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
+ */
+
+/* Encoded size in bytes of a pnfs_osd_data_map, one term per field. */
+static inline int
+_osd_data_map_xdr_sz(void)
+{
+       return 4        /* odm_num_comps */
+            + 8        /* odm_stripe_unit */
+            + 4        /* odm_group_width */
+            + 4        /* odm_group_depth */
+            + 4        /* odm_mirror_cnt */
+            + 4;       /* odm_raid_algorithm */
+}
+
+/*
+ * Decode a pnfs_osd_data_map from the XDR words at @p into @data_map, log
+ * the decoded values through dprintk() and return the position just past
+ * the map.  The fields are consumed in wire order, so the statement order
+ * below must not change.
+ */
+static __be32 *
+_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+       data_map->odm_num_comps = be32_to_cpup(p++);
+       p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
+       data_map->odm_group_width = be32_to_cpup(p++);
+       data_map->odm_group_depth = be32_to_cpup(p++);
+       data_map->odm_mirror_cnt = be32_to_cpup(p++);
+       data_map->odm_raid_algorithm = be32_to_cpup(p++);
+       dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+               "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+               __func__,
+               data_map->odm_num_comps,
+               (unsigned long long)data_map->odm_stripe_unit,
+               data_map->odm_group_width,
+               data_map->odm_group_depth,
+               data_map->odm_mirror_cnt,
+               data_map->odm_raid_algorithm);
+       return p;
+}
+
+/*
+ * Decode the fixed head of an object layout: the data map followed by
+ * olo_comps_index and olo_num_comps.  @iter is zeroed and then primed with
+ * the number of components, ready for pnfs_osd_xdr_decode_layout_comp().
+ *
+ * Returns 0 on success, -EINVAL if @xdr cannot supply the fixed head.
+ */
+int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+       struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
+{
+       /* data map + olo_comps_index + olo_num_comps */
+       const int head_sz = _osd_data_map_xdr_sz() + 4 + 4;
+       __be32 *p;
+
+       memset(iter, 0, sizeof(*iter));
+
+       p = xdr_inline_decode(xdr, head_sz);
+       if (unlikely(p == NULL))
+               return -EINVAL;
+
+       p = _osd_xdr_decode_data_map(p, &layout->olo_map);
+       layout->olo_comps_index = be32_to_cpup(&p[0]);
+       layout->olo_num_comps = be32_to_cpup(&p[1]);
+
+       iter->total_comps = layout->olo_num_comps;
+       return 0;
+}
+
+/*
+ * Decode the next component of a layout, using the iterator set up by
+ * pnfs_osd_xdr_decode_layout_map().
+ *
+ * Returns true after decoding one component into @comp, in which case
+ * *@err has been set to 0.  Returns false either when all
+ * iter->total_comps components have already been decoded -- *@err is NOT
+ * written on that path -- or when decoding failed, with the error stored
+ * in *@err.
+ * NOTE(review): because *@err is untouched at end-of-iteration, a caller
+ * looping on this function presumably has to initialise it itself (this
+ * matters when total_comps is 0) -- confirm against the callers.
+ */
+bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+       struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+       int *err)
+{
+       BUG_ON(iter->decoded_comps > iter->total_comps);
+       if (iter->decoded_comps == iter->total_comps)
+               return false;
+
+       *err = _osd_xdr_decode_object_cred(comp, xdr);
+       if (unlikely(*err)) {
+               dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
+                       "total_comps=%d\n", __func__, *err,
+                       iter->decoded_comps, iter->total_comps);
+               return false; /* stop the loop */
+       }
+       dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
+               "key_len=%u cap_len=%u\n",
+               __func__,
+               _DEVID_LO(&comp->oc_object_id.oid_device_id),
+               _DEVID_HI(&comp->oc_object_id.oid_device_id),
+               comp->oc_object_id.oid_partition_id,
+               comp->oc_object_id.oid_object_id,
+               comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+
+       iter->decoded_comps++;
+       return true;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, all
+ *       variable strings fields are left inside the rpc buffer and are only
+ *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ *       should not be freed while the returned information is in use.
+ */
+/*
+ * struct nfs4_string {
+ *     unsigned int len;
+ *     char *data;
+ * }; // size [variable]
+ * NOTE: Returned string points to inside the XDR buffer
+ *
+ * Read a length-prefixed opaque at @p into @str without copying and return
+ * the position of the next XDR word after its (padded) payload.
+ */
+static __be32 *
+__read_u8_opaque(__be32 *p, struct nfs4_string *str)
+{
+       __be32 *payload = p + 1;
+
+       str->len = be32_to_cpup(p);
+       str->data = (char *)payload;
+
+       return payload + XDR_QUADLEN(str->len);
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ *     u32                     oti_type;
+ *     struct nfs4_string      oti_scsi_device_id;
+ * };// size 4 + [variable]
+ *
+ * Read a target id at @p and return the position just past it.
+ */
+static __be32 *
+__read_targetid(__be32 *p, struct pnfs_osd_targetid *targetid)
+{
+       u32 type = be32_to_cpup(p++);
+
+       targetid->oti_type = type;
+
+       /* only these two id types are followed by a string decoded here */
+       if (type == OBJ_TARGET_SCSI_NAME || type == OBJ_TARGET_SCSI_DEVICE_ID)
+               p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
+
+       return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ *     struct nfs4_string      r_netid;
+ *     struct nfs4_string      r_addr;
+ * };
+ *
+ * Read r_netid then r_addr at @p and return the position just past them.
+ */
+static __be32 *
+__read_net_addr(__be32 *p, struct pnfs_osd_net_addr *netaddr)
+{
+       __be32 *after_netid = __read_u8_opaque(p, &netaddr->r_netid);
+
+       return __read_u8_opaque(after_netid, &netaddr->r_addr);
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ *     u32                             ota_available;
+ *     struct pnfs_osd_net_addr        ota_netaddr;
+ * };
+ *
+ * Read a target address at @p; the net address is decoded only when
+ * ota_available is non-zero.  Returns the position just past what was read.
+ */
+static __be32 *
+__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
+{
+       u32 available = be32_to_cpup(p++);
+
+       targetaddr->ota_available = available;
+       if (!available)
+               return p;
+
+       return __read_net_addr(p, &targetaddr->ota_netaddr);
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ *     struct pnfs_osd_targetid        oda_targetid;
+ *     struct pnfs_osd_targetaddr      oda_targetaddr;
+ *     u8                              oda_lun[8];
+ *     struct nfs4_string              oda_systemid;
+ *     struct pnfs_osd_object_cred     oda_root_obj_cred;
+ *     struct nfs4_string              oda_osdname;
+ * };
+ */
+
+/* Variant of the opaque-credential decoder that works on a raw word
+ * pointer; pnfs_osd_xdr_decode_deviceaddr needs it because it does not
+ * have an xdr_stream.  The credential is left pointing into the buffer.
+ */
+static __be32 *
+__read_opaque_cred(__be32 *p,
+                             struct pnfs_osd_opaque_cred *opaque_cred)
+{
+       __be32 *data = p + 1;
+
+       opaque_cred->cred_len = be32_to_cpup(p);
+       opaque_cred->cred = data;
+
+       return data + XDR_QUADLEN(opaque_cred->cred_len);
+}
+
+/*
+ * Read an object credential from the raw XDR words at @p: object id, two
+ * 32-bit fields, then the two opaque credentials.  Returns the position
+ * just past it.
+ */
+static __be32 *
+__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
+{
+       p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+
+       comp->oc_osd_version = be32_to_cpup(&p[0]);
+       comp->oc_cap_key_sec = be32_to_cpup(&p[1]);
+       p += 2;
+
+       p = __read_opaque_cred(p, &comp->oc_cap_key);
+       return __read_opaque_cred(p, &comp->oc_cap);
+}
+
+/*
+ * Decode a pnfs_osd_deviceaddr from the raw XDR words at @p, field by
+ * field in wire order.  String and credential members are not copied: they
+ * end up pointing into the buffer at @p, so that buffer has to outlive
+ * @deviceaddr.
+ *
+ * The final statement writes a NUL byte into the buffer, right after the
+ * osdname bytes.
+ * NOTE(review): none of the lengths read from the buffer are checked
+ * against the size of the buffer, neither while walking it nor for that
+ * NUL write -- presumably the buffer is one page filled in from a server
+ * reply; confirm that an oversized length cannot walk past its end.
+ */
+void pnfs_osd_xdr_decode_deviceaddr(
+       struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+       p = __read_targetid(p, &deviceaddr->oda_targetid);
+
+       p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
+
+       p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
+                                   sizeof(deviceaddr->oda_lun));
+
+       p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
+
+       p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
+
+       p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
+
+       /* libosd likes this terminated in dbg. It's last, so no problems */
+       deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ *     u32     dsu_valid;
+ *     s64     dsu_delta;
+ *     u32     olu_ioerr_flag;
+ * }; xdr size 4 + [8, only when dsu_valid] + 4
+ *
+ * dsu_delta is only written when dsu_valid is set, so the space reserved
+ * has to shrink with it: reserving the full 16 bytes unconditionally
+ * would leave 8 reserved-but-unwritten bytes in the stream after
+ * olu_ioerr_flag.
+ *
+ * Returns 0 on success, -E2BIG if @xdr has no room left.
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+                                struct pnfs_osd_layoutupdate *lou)
+{
+       size_t nbytes = 4 + 4;  /* dsu_valid + olu_ioerr_flag */
+       __be32 *p;
+
+       if (lou->dsu_valid)
+               nbytes += 8;    /* dsu_delta */
+
+       p = xdr_reserve_space(xdr, nbytes);
+       if (!p)
+               return -E2BIG;
+
+       *p++ = cpu_to_be32(lou->dsu_valid);
+       if (lou->dsu_valid)
+               p = xdr_encode_hyper(p, lou->dsu_delta);
+       *p = cpu_to_be32(lou->olu_ioerr_flag);
+       return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ *     struct nfs4_deviceid    oid_device_id;
+ *     u64                     oid_partition_id;
+ *     u64                     oid_object_id;
+ * }; // xdr size 32 bytes
+ *
+ * Encode @object_id at @p and return the position just past it.
+ */
+static inline __be32 *
+pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
+{
+       __be32 *cur;
+
+       cur = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
+                                     sizeof(object_id->oid_device_id.data));
+       cur = xdr_encode_hyper(cur, object_id->oid_partition_id);
+
+       return xdr_encode_hyper(cur, object_id->oid_object_id);
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ *     struct pnfs_osd_objid   oer_component;
+ *     u64                     oer_comp_offset;
+ *     u64                     oer_comp_length;
+ *     u32                     oer_iswrite;
+ *     u32                     oer_errno;
+ * }; // xdr size 32 + 24 bytes
+ *
+ * Encode @ioerr into the 32 + 24 bytes starting at @p; the space must
+ * already have been reserved (see pnfs_osd_xdr_ioerr_reserve_space()).
+ */
+void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
+{
+       __be32 *cur = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
+
+       cur = xdr_encode_hyper(cur, ioerr->oer_comp_offset);
+       cur = xdr_encode_hyper(cur, ioerr->oer_comp_length);
+       cur[0] = cpu_to_be32(ioerr->oer_iswrite);
+       cur[1] = cpu_to_be32(ioerr->oer_errno);
+}
+
+/*
+ * Reserve room in @xdr for one encoded pnfs_osd_ioerr (32 + 24 bytes).
+ * Returns the reserved position, or NULL (after a debug message) when the
+ * stream is full.
+ */
+__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
+{
+       __be32 *p = xdr_reserve_space(xdr, 32 + 24);
+
+       if (unlikely(p == NULL)) {
+               dprintk("%s: out of xdr space\n", __func__);
+               return NULL;
+       }
+
+       return p;
+}
index c80add6..7913961 100644 (file)
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
                        TASK_UNINTERRUPTIBLE);
 }
 
+/*
+ * Generic pg_test: may @req be added to the requests already counted in
+ * @desc?  It may when the descriptor's block size is at least one page and
+ * the accumulated byte count plus @req still fits within that block size.
+ */
+static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+{
+       /*
+        * FIXME: ideally we should be able to coalesce all requests
+        * that are not block boundary aligned, but currently this
+        * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+        * since nfs_flush_multi and nfs_pagein_multi assume you
+        * can have only one struct nfs_page.
+        */
+       if (desc->pg_bsize >= PAGE_SIZE)
+               return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+
+       return false;
+}
+
 /**
  * nfs_pageio_init - initialise a page io descriptor
  * @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
        desc->pg_ioflags = io_flags;
        desc->pg_error = 0;
        desc->pg_lseg = NULL;
+       desc->pg_test = nfs_generic_pg_test;
+       pnfs_pageio_init(desc, inode);
 }
 
 /**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
  *
  * Return 'true' if this is the case, else return 'false'.
  */
-static int nfs_can_coalesce_requests(struct nfs_page *prev,
-                                    struct nfs_page *req,
-                                    struct nfs_pageio_descriptor *pgio)
+static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+                                     struct nfs_page *req,
+                                     struct nfs_pageio_descriptor *pgio)
 {
        if (req->wb_context->cred != prev->wb_context->cred)
-               return 0;
+               return false;
        if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
-               return 0;
+               return false;
        if (req->wb_context->state != prev->wb_context->state)
-               return 0;
+               return false;
        if (req->wb_index != (prev->wb_index + 1))
-               return 0;
+               return false;
        if (req->wb_pgbase != 0)
-               return 0;
+               return false;
        if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
-               return 0;
-       /*
-        * Non-whole file layouts need to check that req is inside of
-        * pgio->pg_lseg.
-        */
-       if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
-               return 0;
-       return 1;
+               return false;
+       return pgio->pg_test(pgio, prev, req);
 }
 
 /**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
                                     struct nfs_page *req)
 {
-       size_t newlen = req->wb_bytes;
-
        if (desc->pg_count != 0) {
                struct nfs_page *prev;
 
-               /*
-                * FIXME: ideally we should be able to coalesce all requests
-                * that are not block boundary aligned, but currently this
-                * is problematic for the case of bsize < PAGE_CACHE_SIZE,
-                * since nfs_flush_multi and nfs_pagein_multi assume you
-                * can have only one struct nfs_page.
-                */
-               if (desc->pg_bsize < PAGE_SIZE)
-                       return 0;
-               newlen += desc->pg_count;
-               if (newlen > desc->pg_bsize)
-                       return 0;
                prev = nfs_list_entry(desc->pg_list.prev);
                if (!nfs_can_coalesce_requests(prev, req, desc))
                        return 0;
-       } else
+       } else {
                desc->pg_base = req->wb_pgbase;
+       }
        nfs_list_remove_request(req);
        nfs_list_add_request(req, &desc->pg_list);
-       desc->pg_count = newlen;
+       desc->pg_count += req->wb_bytes;
        return 1;
 }
 
index 101c85a..8c1309d 100644 (file)
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
        atomic_inc(&lo->plh_refcount);
 }
 
+/*
+ * Allocate a layout header for @ino: through the layout driver's
+ * alloc_layout_hdr hook when it has one, otherwise as a zeroed plain
+ * struct pnfs_layout_hdr.
+ */
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+
+       if (ld->alloc_layout_hdr)
+               return ld->alloc_layout_hdr(ino, gfp_flags);
+
+       return kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+}
+
+/*
+ * Free a layout header obtained from pnfs_alloc_layout_hdr().
+ *
+ * The choice is keyed on the driver's alloc_layout_hdr hook, i.e. on who
+ * allocated the header: driver-allocated headers go back through
+ * free_layout_hdr, plain ones are kfree()d.
+ * NOTE(review): a driver that sets alloc_layout_hdr is assumed to set
+ * free_layout_hdr as well; it is called without a NULL check.
+ */
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+
+       /* plain statements: a void function must not return an expression */
+       if (ld->alloc_layout_hdr)
+               ld->free_layout_hdr(lo);
+       else
+               kfree(lo);
+}
+
 static void
 destroy_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        BUG_ON(!list_empty(&lo->plh_layouts));
        NFS_I(lo->plh_inode)->layout = NULL;
-       kfree(lo);
+       pnfs_free_layout_hdr(lo);
 }
 
 static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
 {
        struct inode *inode = lseg->pls_layout->plh_inode;
 
-       BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+       WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        if (list_empty(&lseg->pls_layout->plh_segs)) {
                set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(put_lseg);
 
+/* first offset past a range; NFS4_MAX_UINT64 if start + len wraps around */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+       u64 end = start + len;
+
+       if (end < start)
+               return NFS4_MAX_UINT64;
+       return end;
+}
+
+/* last octet in a range; @len must be non-zero.  NFS4_MAX_UINT64 is
+ * returned when start + len does not end up above start.
+ */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+       u64 end;
+
+       BUG_ON(!len);
+       end = start + len;
+       if (end <= start)
+               return NFS4_MAX_UINT64;
+       return end - 1;
+}
+
+/*
+ * is l2 fully contained in l1?
+ *   start1                             end1
+ *   [----------------------------------)
+ *           start2           end2
+ *           [----------------)
+ *
+ * Returns 1 if so, 0 otherwise.
+ */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+                struct pnfs_layout_range *l2)
+{
+       u64 start1 = l1->offset;
+       u64 start2 = l2->offset;
+       u64 end1 = end_offset(start1, l1->length);
+       u64 end2 = end_offset(start2, l2->length);
+
+       /* l2 may neither start before l1 nor end after it */
+       if (start1 > start2)
+               return 0;
+       return end1 >= end2;
+}
+
+/*
+ * is l1 and l2 intersecting?
+ *   start1                             end1
+ *   [----------------------------------)
+ *                              start2           end2
+ *                              [----------------)
+ *
+ * Returns 1 if so, 0 otherwise.  An end of NFS4_MAX_UINT64 is treated as
+ * reaching past any start.
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+                   struct pnfs_layout_range *l2)
+{
+       u64 start1 = l1->offset;
+       u64 start2 = l2->offset;
+       u64 end1 = end_offset(start1, l1->length);
+       u64 end2 = end_offset(start2, l2->length);
+
+       /* l1 stops at or before the start of l2 */
+       if (end1 != NFS4_MAX_UINT64 && end1 <= start2)
+               return 0;
+       /* l2 stops at or before the start of l1 */
+       if (end2 != NFS4_MAX_UINT64 && end2 <= start1)
+               return 0;
+       return 1;
+}
+
 static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+                struct pnfs_layout_range *recall_range)
 {
-       return (recall_iomode == IOMODE_ANY ||
-               lseg_iomode == recall_iomode);
+       return (recall_range->iomode == IOMODE_ANY ||
+               lseg_range->iomode == recall_range->iomode) &&
+              lo_seg_intersecting(lseg_range, recall_range);
 }
 
 /* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 int
 mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
-                           u32 iomode)
+                           struct pnfs_layout_range *recall_range)
 {
        struct pnfs_layout_segment *lseg, *next;
        int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                return 0;
        }
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-               if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+               if (!recall_range ||
+                   should_free_lseg(&lseg->pls_range, recall_range)) {
                        dprintk("%s: freeing lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
        lo = nfsi->layout;
        if (lo) {
                lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-               mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
+               mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
        }
        spin_unlock(&nfsi->vfs_inode.i_lock);
        pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
-          u32 iomode,
+          struct pnfs_layout_range *range,
           gfp_t gfp_flags)
 {
        struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
                        goto out_err_free;
        }
 
-       lgp->args.minlength = NFS4_MAX_UINT64;
+       lgp->args.minlength = PAGE_CACHE_SIZE;
+       if (lgp->args.minlength > range->length)
+               lgp->args.minlength = range->length;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-       lgp->args.range.iomode = iomode;
-       lgp->args.range.offset = 0;
-       lgp->args.range.length = NFS4_MAX_UINT64;
+       lgp->args.range = *range;
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        nfs4_proc_layoutget(lgp);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
-               set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+               set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
        }
 
        /* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
        return NULL;
 }
 
+/* Initiates a LAYOUTRETURN(FILE)
+ *
+ * Under the inode's i_lock, mark_matching_lsegs_invalid() is called with a
+ * NULL range, the layout stateid is copied and a reference on the layout
+ * header is taken.  After dropping the lock the collected segments are
+ * freed and an nfs4_layoutreturn is allocated, filled in and passed to
+ * nfs4_proc_layoutreturn().
+ *
+ * Returns 0 without sending anything when the inode has no layout or
+ * mark_matching_lsegs_invalid() returns 0; -ENOMEM if the request cannot
+ * be allocated; otherwise the status of nfs4_proc_layoutreturn().
+ *
+ * NOTE(review): on the -ENOMEM path the reference taken with
+ * get_layout_hdr() does not appear to be dropped -- the comment in the
+ * body says it is matched in nfs4_layoutreturn_release, which presumably
+ * only runs once the request has been issued.  Confirm and add the
+ * missing put.
+ */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+       struct pnfs_layout_hdr *lo = NULL;
+       struct nfs_inode *nfsi = NFS_I(ino);
+       LIST_HEAD(tmp_list);
+       struct nfs4_layoutreturn *lrp;
+       nfs4_stateid stateid;
+       int status = 0;
+
+       dprintk("--> %s\n", __func__);
+
+       spin_lock(&ino->i_lock);
+       lo = nfsi->layout;
+       if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
+               spin_unlock(&ino->i_lock);
+               dprintk("%s: no layout segments to return\n", __func__);
+               goto out;
+       }
+       stateid = nfsi->layout->plh_stateid;
+       /* Reference matched in nfs4_layoutreturn_release */
+       get_layout_hdr(lo);
+       spin_unlock(&ino->i_lock);
+       pnfs_free_lseg_list(&tmp_list);
+
+       WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
+
+       lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+       if (unlikely(lrp == NULL)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       lrp->args.stateid = stateid;
+       lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+       lrp->args.inode = ino;
+       lrp->clp = NFS_SERVER(ino)->nfs_client;
+
+       status = nfs4_proc_layoutreturn(lrp);
+out:
+       dprintk("<-- %s status: %d\n", __func__, status);
+       return status;
+}
+
 bool pnfs_roc(struct inode *ino)
 {
        struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
  * are seen first.
  */
 static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+          struct pnfs_layout_range *l2)
 {
+       s64 d;
+
+       /* high offset > low offset */
+       d = l1->offset - l2->offset;
+       if (d)
+               return d;
+
+       /* short length > long length */
+       d = l2->length - l1->length;
+       if (d)
+               return d;
+
        /* read > read/write */
-       return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+       return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 }
 
 static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg)
 {
        struct pnfs_layout_segment *lp;
-       int found = 0;
 
        dprintk("%s:Begin\n", __func__);
 
        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-               if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+               if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
-               found = 1;
-               break;
-       }
-       if (!found) {
-               list_add_tail(&lseg->pls_list, &lo->plh_segs);
-               dprintk("%s: inserted lseg %p "
-                       "iomode %d offset %llu length %llu at tail\n",
-                       __func__, lseg, lseg->pls_range.iomode,
-                       lseg->pls_range.offset, lseg->pls_range.length);
+               goto out;
        }
+       list_add_tail(&lseg->pls_list, &lo->plh_segs);
+       dprintk("%s: inserted lseg %p "
+               "iomode %d offset %llu length %llu at tail\n",
+               __func__, lseg, lseg->pls_range.iomode,
+               lseg->pls_range.offset, lseg->pls_range.length);
+out:
        get_layout_hdr(lo);
 
        dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
 {
        struct pnfs_layout_hdr *lo;
 
-       lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+       lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
        if (likely(nfsi->layout == NULL))       /* Won the race? */
                nfsi->layout = new;
        else
-               kfree(new);
+               pnfs_free_layout_hdr(new);
        return nfsi->layout;
 }
 
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
  * READ                RW      true
  */
 static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_range *ls_range,
+                struct pnfs_layout_range *range)
 {
-       return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+       struct pnfs_layout_range range1;
+
+       if ((range->iomode == IOMODE_RW &&
+            ls_range->iomode != IOMODE_RW) ||
+           !lo_seg_intersecting(ls_range, range))
+               return 0;
+
+       /* range1 covers only the first byte in the range */
+       range1 = *range;
+       range1.length = 1;
+       return lo_seg_contained(ls_range, &range1);
 }
 
 /*
  * lookup range in layout
  */
 static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+               struct pnfs_layout_range *range)
 {
        struct pnfs_layout_segment *lseg, *ret = NULL;
 
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-                   is_matching_lseg(lseg, iomode)) {
+                   is_matching_lseg(&lseg->pls_range, range)) {
                        ret = get_lseg(lseg);
                        break;
                }
-               if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+               if (cmp_layout(range, &lseg->pls_range) > 0)
                        break;
        }
 
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino,
                   struct nfs_open_context *ctx,
+                  loff_t pos,
+                  u64 count,
                   enum pnfs_iomode iomode,
                   gfp_t gfp_flags)
 {
+       struct pnfs_layout_range arg = {
+               .iomode = iomode,
+               .offset = pos,
+               .length = count,
+       };
+       unsigned pg_offset;
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
                goto out_unlock;
 
        /* Check to see if the layout for the given range already exists */
-       lseg = pnfs_find_lseg(lo, iomode);
+       lseg = pnfs_find_lseg(lo, &arg);
        if (lseg)
                goto out_unlock;
 
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
                spin_unlock(&clp->cl_lock);
        }
 
-       lseg = send_layoutget(lo, ctx, iomode, gfp_flags);
+       pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+       if (pg_offset) {
+               arg.offset -= pg_offset;
+               arg.length += pg_offset;
+       }
+       arg.length = PAGE_CACHE_ALIGN(arg.length);
+
+       lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
        if (!lseg && first) {
                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        int status = 0;
 
-       /* Verify we got what we asked for.
-        * Note that because the xdr parsing only accepts a single
-        * element array, this can fail even if the server is behaving
-        * correctly.
-        */
-       if (lgp->args.range.iomode > res->range.iomode ||
-           res->range.offset != 0 ||
-           res->range.length != NFS4_MAX_UINT64) {
-               status = -EINVAL;
-               goto out;
-       }
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
        if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
        goto out;
 }
 
-static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
-                            struct nfs_page *prev,
-                            struct nfs_page *req)
+bool
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+                    struct nfs_page *req)
 {
+       enum pnfs_iomode access_type;
+       gfp_t gfp_flags;
+
+       /* We assume that pg_ioflags == 0 iff we're reading a page */
+       if (pgio->pg_ioflags == 0) {
+               access_type = IOMODE_READ;
+               gfp_flags = GFP_KERNEL;
+       } else {
+               access_type = IOMODE_RW;
+               gfp_flags = GFP_NOFS;
+       }
+
        if (pgio->pg_count == prev->wb_bytes) {
                /* This is first coelesce call for a series of nfs_pages */
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   prev->wb_context,
-                                                  IOMODE_READ,
-                                                  GFP_KERNEL);
+                                                  req_offset(req),
+                                                  pgio->pg_count,
+                                                  access_type,
+                                                  gfp_flags);
+               return true;
        }
-       return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
 
-void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
-       struct pnfs_layoutdriver_type *ld;
+       if (pgio->pg_lseg &&
+           req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+                                        pgio->pg_lseg->pls_range.length))
+               return false;
 
-       ld = NFS_SERVER(inode)->pnfs_curr_ld;
-       pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
+       return true;
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
-                             struct nfs_page *prev,
-                             struct nfs_page *req)
+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_write_done(struct nfs_write_data *data)
 {
-       if (pgio->pg_count == prev->wb_bytes) {
-               /* This is first coelesce call for a series of nfs_pages */
-               pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-                                                  prev->wb_context,
-                                                  IOMODE_RW,
-                                                  GFP_NOFS);
-       }
-       return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
+       int status;
 
-void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
-       struct pnfs_layoutdriver_type *ld;
+       if (!data->pnfs_error) {
+               pnfs_set_layoutcommit(data);
+               data->mds_ops->rpc_call_done(&data->task, data);
+               data->mds_ops->rpc_release(data);
+               return 0;
+       }
 
-       ld = NFS_SERVER(inode)->pnfs_curr_ld;
-       pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+       dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+               data->pnfs_error);
+       status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
+                                   data->mds_ops, NFS_FILE_SYNC);
+       return status ? : -EAGAIN;
 }
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
 enum pnfs_try_status
 pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -965,6 +1126,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
        return trypnfs;
 }
 
+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_read_done(struct nfs_read_data *data)
+{
+       int status;
+
+       if (!data->pnfs_error) {
+               __nfs4_read_done_cb(data);
+               data->mds_ops->rpc_call_done(&data->task, data);
+               data->mds_ops->rpc_release(data);
+               return 0;
+       }
+
+       dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+               data->pnfs_error);
+       status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
+                                  data->mds_ops);
+       return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
index 0c015ba..48d0a8e 100644 (file)
@@ -30,6 +30,7 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
 
+#include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 
 enum {
@@ -64,17 +65,29 @@ enum {
        NFS_LAYOUT_DESTROYED,           /* no new use of layout allowed */
 };
 
+enum layoutdriver_policy_flags {
+       /* Should the pNFS client commit and return the layout upon a setattr */
+       PNFS_LAYOUTRET_ON_SETATTR       = 1 << 0,
+};
+
+struct nfs4_deviceid_node;
+
 /* Per-layout driver specific registration structure */
 struct pnfs_layoutdriver_type {
        struct list_head pnfs_tblid;
        const u32 id;
        const char *name;
        struct module *owner;
+       unsigned flags;
+
+       struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+       void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
        struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
        void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
        /* test for nfs page cache coalescing */
-       int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+       bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 
        /* Returns true if layoutdriver wants to divert this request to
         * driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
         */
        enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
        enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+
+       void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+
+       void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+                                    struct xdr_stream *xdr,
+                                    const struct nfs4_layoutreturn_args *args);
+
+       void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+                                    struct xdr_stream *xdr,
+                                    const struct nfs4_layoutcommit_args *args);
 };
 
 struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev);
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
 /* pnfs.c */
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-                  enum pnfs_iomode access_type, gfp_t gfp_flags);
+                  loff_t pos, u64 count, enum pnfs_iomode access_type,
+                  gfp_t gfp_flags);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
                                             const struct rpc_call_ops *, int);
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
                                            const struct rpc_call_ops *);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
+bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
                                  struct nfs4_state *open_state);
 int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               u32 iomode);
+                               struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_ld_write_done(struct nfs_write_data *);
+int pnfs_ld_read_done(struct nfs_read_data *);
+
+/* pnfs_dev.c */
+struct nfs4_deviceid_node {
+       struct hlist_node               node;
+       const struct pnfs_layoutdriver_type *ld;
+       const struct nfs_client         *nfs_client;
+       struct nfs4_deviceid            deviceid;
+       atomic_t                        ref;
+};
+
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
+                            const struct pnfs_layoutdriver_type *,
+                            const struct nfs_client *,
+                            const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
 
 static inline int lo_fail_bit(u32 iomode)
 {
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
                put_lseg(req->wb_commit_lseg);
 }
 
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+       if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+               return false;
+       return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+               PNFS_LAYOUTRET_ON_SETATTR;
+}
+
+static inline int pnfs_return_layout(struct inode *ino)
+{
+       struct nfs_inode *nfsi = NFS_I(ino);
+       struct nfs_server *nfss = NFS_SERVER(ino);
+
+       if (pnfs_enabled_sb(nfss) && nfsi->layout)
+               return _pnfs_return_layout(ino);
+
+       return 0;
+}
+
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+                                   struct inode *inode)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+       if (ld)
+               pgio->pg_test = ld->pg_test;
+}
+
 #else  /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
 
 static inline struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-                  enum pnfs_iomode access_type, gfp_t gfp_flags)
+                  loff_t pos, u64 count, enum pnfs_iomode access_type,
+                  gfp_t gfp_flags)
 {
        return NULL;
 }
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
        return PNFS_NOT_ATTEMPTED;
 }
 
+static inline int pnfs_return_layout(struct inode *ino)
+{
+       return 0;
+}
+
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+       return false;
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 }
 
-static inline void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
-{
-       pgio->pg_test = NULL;
-}
-
-static inline void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+                                   struct inode *inode)
 {
-       pgio->pg_test = NULL;
 }
 
 static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
        return 0;
 }
+
+static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644 (file)
index 0000000..c65e133
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ *  Device operations for the pnfs client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY                NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS       5
+#define NFS4_DEVICE_ID_HASH_SIZE       (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK       (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+       u32 *p = (u32 *)id;
+
+       dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+               p[0], p[1], p[2], p[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+       unsigned char *cptr = (unsigned char *)id->data;
+       unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+       u32 x = 0;
+
+       while (nbytes--) {
+               x *= 37;
+               x += *cptr++;
+       }
+       return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+                const struct nfs_client *clp, const struct nfs4_deviceid *id,
+                long hash)
+{
+       struct nfs4_deviceid_node *d;
+       struct hlist_node *n;
+
+       hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+               if (d->ld == ld && d->nfs_client == clp &&
+                   !memcmp(&d->deviceid, id, sizeof(*id))) {
+                       if (atomic_read(&d->ref))
+                               return d;
+                       else
+                               continue;
+               }
+       return NULL;
+}
+
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @clp nfs_client associated with deviceid
+ * @id deviceid to look up
+ */
+struct nfs4_deviceid_node *
+_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+                  const struct nfs_client *clp, const struct nfs4_deviceid *id,
+                  long hash)
+{
+       struct nfs4_deviceid_node *d;
+
+       rcu_read_lock();
+       d = _lookup_deviceid(ld, clp, id, hash);
+       if (d && !atomic_inc_not_zero(&d->ref))
+               d = NULL;
+       rcu_read_unlock();
+       return d;
+}
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+                      const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+       return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Unhash and put deviceid
+ *
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ */
+struct nfs4_deviceid_node *
+nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
+                        const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+       struct nfs4_deviceid_node *d;
+
+       spin_lock(&nfs4_deviceid_lock);
+       rcu_read_lock();
+       d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+       rcu_read_unlock();
+       if (!d) {
+               spin_unlock(&nfs4_deviceid_lock);
+               return NULL;
+       }
+       hlist_del_init_rcu(&d->node);
+       spin_unlock(&nfs4_deviceid_lock);
+       synchronize_rcu();
+
+       /* balance the initial ref set in nfs4_init_deviceid_node */
+       if (atomic_dec_and_test(&d->ref))
+               return d;
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
+
+/*
+ * Delete a deviceid from cache
+ *
+ * @clp struct nfs_client qualifying the deviceid
+ * @id deviceid to delete
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+                    const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+       struct nfs4_deviceid_node *d;
+
+       d = nfs4_unhash_put_deviceid(ld, clp, id);
+       if (!d)
+               return;
+       d->ld->free_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
+                       const struct pnfs_layoutdriver_type *ld,
+                       const struct nfs_client *nfs_client,
+                       const struct nfs4_deviceid *id)
+{
+       INIT_HLIST_NODE(&d->node);
+       d->ld = ld;
+       d->nfs_client = nfs_client;
+       d->deviceid = *id;
+       atomic_set(&d->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Insert a caller-initialized deviceid node into cache, unless already present
+ *
+ * @new new deviceid node
+ *      Note that the caller must set up the following members:
+ *        new->ld
+ *        new->nfs_client
+ *        new->deviceid
+ *
+ * @ret the inserted node, if none found, otherwise, the found entry.
+ */
+struct nfs4_deviceid_node *
+nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
+{
+       struct nfs4_deviceid_node *d;
+       long hash;
+
+       spin_lock(&nfs4_deviceid_lock);
+       hash = nfs4_deviceid_hash(&new->deviceid);
+       d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
+       if (d) {
+               spin_unlock(&nfs4_deviceid_lock);
+               return d;
+       }
+
+       hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+       spin_unlock(&nfs4_deviceid_lock);
+
+       return new;
+}
+EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
+
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * @ret true iff the node was deleted
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+       if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
+               return false;
+       hlist_del_init_rcu(&d->node);
+       spin_unlock(&nfs4_deviceid_lock);
+       synchronize_rcu();
+       d->ld->free_deviceid_node(d);
+       return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+       struct nfs4_deviceid_node *d;
+       struct hlist_node *n, *next;
+       HLIST_HEAD(tmp);
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+               if (d->nfs_client == clp && atomic_read(&d->ref)) {
+                       hlist_del_init_rcu(&d->node);
+                       hlist_add_head(&d->node, &tmp);
+               }
+       rcu_read_unlock();
+
+       if (hlist_empty(&tmp))
+               return;
+
+       synchronize_rcu();
+       hlist_for_each_entry_safe(d, n, next, &tmp, node)
+               if (atomic_dec_and_test(&d->ref))
+                       d->ld->free_deviceid_node(d);
+}
+
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+       long h;
+
+       spin_lock(&nfs4_deviceid_lock);
+       for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+               _deviceid_purge_client(clp, h);
+       spin_unlock(&nfs4_deviceid_lock);
+}
index 2bcf0dc..20a7f95 100644 (file)
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
        atomic_set(&req->wb_complete, requests);
 
        BUG_ON(desc->pg_lseg != NULL);
-       lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+       lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                 req_offset(req), desc->pg_count,
+                                 IOMODE_READ, GFP_KERNEL);
        ClearPageError(page);
        offset = 0;
        nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
        }
        req = nfs_list_entry(data->pages.next);
        if ((!lseg) && list_is_singular(&data->pages))
-               lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+               lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                         req_offset(req), desc->pg_count,
+                                         IOMODE_READ, GFP_KERNEL);
 
        ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
                                0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        if (ret == 0)
                goto read_complete; /* all pages were read */
 
-       pnfs_pageio_init_read(&pgio, inode);
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
index e288f06..ce40e5c 100644 (file)
@@ -63,6 +63,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
        return 0;
 }
+#ifdef CONFIG_NFS_V4_1
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+       if (nfs4_has_session(server->nfs_client))
+               seq_printf(m, ",sessions");
+}
+#else
+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+       seq_printf(m, ",pnfs=");
+       if (server->pnfs_curr_ld)
+               seq_printf(m, "%s", server->pnfs_curr_ld->name);
+       else
+               seq_printf(m, "not configured");
+}
+#else  /* CONFIG_NFS_V4_1 */
+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */
 
 static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
 {
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
                seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+               show_sessions(m, nfss);
+               show_pnfs(m, nfss);
        }
 #endif
 
index 49c715b..e268e3b 100644 (file)
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
        atomic_set(&req->wb_complete, requests);
 
        BUG_ON(desc->pg_lseg);
-       lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+       lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                 req_offset(req), desc->pg_count,
+                                 IOMODE_RW, GFP_NOFS);
        ClearPageError(page);
        offset = 0;
        nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
        }
        req = nfs_list_entry(data->pages.next);
        if ((!lseg) && list_is_singular(&data->pages))
-               lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+               lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                         req_offset(req), desc->pg_count,
+                                         IOMODE_RW, GFP_NOFS);
 
        if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
            (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 {
        size_t wsize = NFS_SERVER(inode)->wsize;
 
-       pnfs_pageio_init_write(pgio, inode);
-
        if (wsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
        else
index 8e66c5c..504b289 100644 (file)
@@ -562,6 +562,7 @@ enum {
        NFSPROC4_CLNT_LAYOUTGET,
        NFSPROC4_CLNT_GETDEVICEINFO,
        NFSPROC4_CLNT_LAYOUTCOMMIT,
+       NFSPROC4_CLNT_LAYOUTRETURN,
 };
 
 /* nfs41 types */
index 91af2e4..3a34e80 100644 (file)
@@ -68,7 +68,7 @@ struct nfs_pageio_descriptor {
        int                     pg_ioflags;
        int                     pg_error;
        struct pnfs_layout_segment *pg_lseg;
-       int                     (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+       bool                    (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 };
 
 #define NFS_WBACK_BUSY(req)    (test_bit(PG_BUSY,&(req)->wb_flags))
index 7e371f7..5e8444a 100644 (file)
@@ -269,6 +269,27 @@ struct nfs4_layoutcommit_data {
        struct nfs4_layoutcommit_res res;
 };
 
+struct nfs4_layoutreturn_args {
+       __u32   layout_type;
+       struct inode *inode;
+       nfs4_stateid stateid;
+       struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutreturn_res {
+       struct nfs4_sequence_res seq_res;
+       u32 lrs_present;
+       nfs4_stateid stateid;
+};
+
+struct nfs4_layoutreturn {
+       struct nfs4_layoutreturn_args args;
+       struct nfs4_layoutreturn_res res;
+       struct rpc_cred *cred;
+       struct nfs_client *clp;
+       int rpc_status;
+};
+
 /*
  * Arguments to the open call.
  */
@@ -1087,6 +1108,7 @@ struct nfs_read_data {
        const struct rpc_call_ops *mds_ops;
        int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
        __u64                   mds_offset;
+       int                     pnfs_error;
        struct page             *page_array[NFS_PAGEVEC_SIZE];
 };
 
@@ -1112,6 +1134,7 @@ struct nfs_write_data {
        unsigned long           timestamp;      /* For lease renewal */
 #endif
        __u64                   mds_offset;     /* Filelayout dense stripe */
+       int                     pnfs_error;
        struct page             *page_array[NFS_PAGEVEC_SIZE];
 };
 
diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
new file mode 100644 (file)
index 0000000..76efbdd
--- /dev/null
@@ -0,0 +1,345 @@
+/*
+ *  pNFS-osd on-the-wire data structures
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __PNFS_OSD_XDR_H__
+#define __PNFS_OSD_XDR_H__
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <scsi/osd_protocol.h>
+
+#define PNFS_OSD_OSDNAME_MAXSIZE 256
+
+/*
+ * draft-ietf-nfsv4-minorversion-22
+ * draft-ietf-nfsv4-pnfs-obj-12
+ */
+
+/* Layout Structure */
+
+enum pnfs_osd_raid_algorithm4 {
+       PNFS_OSD_RAID_0         = 1,
+       PNFS_OSD_RAID_4         = 2,
+       PNFS_OSD_RAID_5         = 3,
+       PNFS_OSD_RAID_PQ        = 4     /* Reed-Solomon P+Q */
+};
+
+/*   struct pnfs_osd_data_map4 {
+ *       uint32_t                    odm_num_comps;
+ *       length4                     odm_stripe_unit;
+ *       uint32_t                    odm_group_width;
+ *       uint32_t                    odm_group_depth;
+ *       uint32_t                    odm_mirror_cnt;
+ *       pnfs_osd_raid_algorithm4    odm_raid_algorithm;
+ *   };
+ */
+struct pnfs_osd_data_map {
+       u32     odm_num_comps;
+       u64     odm_stripe_unit;
+       u32     odm_group_width;
+       u32     odm_group_depth;
+       u32     odm_mirror_cnt;
+       u32     odm_raid_algorithm;
+};
+
+/*   struct pnfs_osd_objid4 {
+ *       deviceid4       oid_device_id;
+ *       uint64_t        oid_partition_id;
+ *       uint64_t        oid_object_id;
+ *   };
+ */
+struct pnfs_osd_objid {
+       struct nfs4_deviceid    oid_device_id;
+       u64                     oid_partition_id;
+       u64                     oid_object_id;
+};
+
+/* For printout. I use:
+ * kprint("dev(%llx:%llx)", _DEVID_LO(pointer), _DEVID_HI(pointer));
+ * BE style
+ */
+#define _DEVID_LO(oid_device_id) \
+       (unsigned long long)be64_to_cpup((__be64 *)(oid_device_id)->data)
+
+#define _DEVID_HI(oid_device_id) \
+       (unsigned long long)be64_to_cpup(((__be64 *)(oid_device_id)->data) + 1)
+
+static inline int
+pnfs_osd_objid_xdr_sz(void)
+{
+       return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2; /* 4-byte units: devid + two u64 ids */
+}
+
+enum pnfs_osd_version {
+       PNFS_OSD_MISSING              = 0,
+       PNFS_OSD_VERSION_1            = 1,
+       PNFS_OSD_VERSION_2            = 2
+};
+
+struct pnfs_osd_opaque_cred {
+       u32 cred_len;
+       void *cred;
+};
+
+enum pnfs_osd_cap_key_sec {
+       PNFS_OSD_CAP_KEY_SEC_NONE     = 0,
+       PNFS_OSD_CAP_KEY_SEC_SSV      = 1,
+};
+
+/*   struct pnfs_osd_object_cred4 {
+ *       pnfs_osd_objid4         oc_object_id;
+ *       pnfs_osd_version4       oc_osd_version;
+ *       pnfs_osd_cap_key_sec4   oc_cap_key_sec;
+ *       opaque                  oc_capability_key<>;
+ *       opaque                  oc_capability<>;
+ *   };
+ */
+struct pnfs_osd_object_cred {
+       struct pnfs_osd_objid           oc_object_id;
+       u32                             oc_osd_version;
+       u32                             oc_cap_key_sec;
+       struct pnfs_osd_opaque_cred     oc_cap_key;
+       struct pnfs_osd_opaque_cred     oc_cap;
+};
+
+/*   struct pnfs_osd_layout4 {
+ *       pnfs_osd_data_map4      olo_map;
+ *       uint32_t                olo_comps_index;
+ *       pnfs_osd_object_cred4   olo_components<>;
+ *   };
+ */
+struct pnfs_osd_layout {
+       struct pnfs_osd_data_map        olo_map;
+       u32                             olo_comps_index;
+       u32                             olo_num_comps;
+       struct pnfs_osd_object_cred     *olo_comps;
+};
+
+/* Device Address */
+enum pnfs_osd_targetid_type {
+       OBJ_TARGET_ANON = 1,
+       OBJ_TARGET_SCSI_NAME = 2,
+       OBJ_TARGET_SCSI_DEVICE_ID = 3,
+};
+
+/*   union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
+ *       case OBJ_TARGET_SCSI_NAME:
+ *           string              oti_scsi_name<>;
+ *
+ *       case OBJ_TARGET_SCSI_DEVICE_ID:
+ *           opaque              oti_scsi_device_id<>;
+ *
+ *       default:
+ *           void;
+ *   };
+ *
+ *   union pnfs_osd_targetaddr4 switch (bool ota_available) {
+ *       case TRUE:
+ *           netaddr4            ota_netaddr;
+ *       case FALSE:
+ *           void;
+ *   };
+ *
+ *   struct pnfs_osd_deviceaddr4 {
+ *       pnfs_osd_targetid4      oda_targetid;
+ *       pnfs_osd_targetaddr4    oda_targetaddr;
+ *       uint64_t                oda_lun;
+ *       opaque                  oda_systemid<>;
+ *       pnfs_osd_object_cred4   oda_root_obj_cred;
+ *       opaque                  oda_osdname<>;
+ *   };
+ */
+struct pnfs_osd_targetid {
+       u32                             oti_type;
+       struct nfs4_string              oti_scsi_device_id;
+};
+
+enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
+
+/*   struct netaddr4 {
+ *       // see struct rpcb in RFC1833
+ *       string r_netid<>;    // network id
+ *       string r_addr<>;     // universal address
+ *   };
+ */
+struct pnfs_osd_net_addr {
+       struct nfs4_string      r_netid;
+       struct nfs4_string      r_addr;
+};
+
+struct pnfs_osd_targetaddr {
+       u32                             ota_available;
+       struct pnfs_osd_net_addr        ota_netaddr;
+};
+
+enum {
+       NETWORK_ID_MAX = 16 / 4,
+       UNIVERSAL_ADDRESS_MAX = 64 / 4,
+       PNFS_OSD_TARGETADDR_MAX = 3 +  NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
+};
+
+struct pnfs_osd_deviceaddr {
+       struct pnfs_osd_targetid        oda_targetid;
+       struct pnfs_osd_targetaddr      oda_targetaddr;
+       u8                              oda_lun[8];
+       struct nfs4_string              oda_systemid;
+       struct pnfs_osd_object_cred     oda_root_obj_cred;
+       struct nfs4_string              oda_osdname;
+};
+
+enum {
+       ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
+       PNFS_OSD_DEVICEADDR_MAX =
+               PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
+               2 /*oda_lun*/ +
+               1 + OSD_SYSTEMID_LEN +
+               1 + ODA_OSDNAME_MAX,
+};
+
+/* LAYOUTCOMMIT: layoutupdate */
+
+/*   union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
+ *       case TRUE:
+ *           int64_t     dsu_delta;
+ *       case FALSE:
+ *           void;
+ *   };
+ *
+ *   struct pnfs_osd_layoutupdate4 {
+ *       pnfs_osd_deltaspaceused4    olu_delta_space_used;
+ *       bool                        olu_ioerr_flag;
+ *   };
+ */
+struct pnfs_osd_layoutupdate {
+       u32     dsu_valid;
+       s64     dsu_delta;
+       u32     olu_ioerr_flag;
+};
+
+/* LAYOUTRETURN: I/O Error Report */
+
+enum pnfs_osd_errno {
+       PNFS_OSD_ERR_EIO                = 1,
+       PNFS_OSD_ERR_NOT_FOUND          = 2,
+       PNFS_OSD_ERR_NO_SPACE           = 3,
+       PNFS_OSD_ERR_BAD_CRED           = 4,
+       PNFS_OSD_ERR_NO_ACCESS          = 5,
+       PNFS_OSD_ERR_UNREACHABLE        = 6,
+       PNFS_OSD_ERR_RESOURCE           = 7
+};
+
+/*   struct pnfs_osd_ioerr4 {
+ *       pnfs_osd_objid4     oer_component;
+ *       length4             oer_comp_offset;
+ *       length4             oer_comp_length;
+ *       bool                oer_iswrite;
+ *       pnfs_osd_errno4     oer_errno;
+ *   };
+ */
+struct pnfs_osd_ioerr {
+       struct pnfs_osd_objid   oer_component;
+       u64                     oer_comp_offset;
+       u64                     oer_comp_length;
+       u32                     oer_iswrite;
+       u32                     oer_errno;
+};
+
+/* OSD XDR API */
+/* Layout helpers */
+/* Layout decoding is done in two parts:
+ * 1. First Call pnfs_osd_xdr_decode_layout_map to read in only the header part
+ *    of the layout. @iter members need not be initialized.
+ *    Returned:
+ *             @layout members are set. (@layout->olo_comps set to NULL).
+ *
+ *             Zero on success, or negative error if passed xdr is broken.
+ *
+ * 2. 2nd Call pnfs_osd_xdr_decode_layout_comp() in a loop until it returns
+ *    false, to decode the next component.
+ *    Returned:
+ *       true if there is more to decode or false if we are done or error.
+ *
+ * Example:
+ *     struct pnfs_osd_xdr_decode_layout_iter iter;
+ *     struct pnfs_osd_layout layout;
+ *     struct pnfs_osd_object_cred comp;
+ *     int status;
+ *
+ *     status = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+ *     if (unlikely(status))
+ *             goto err;
+ *     while(pnfs_osd_xdr_decode_layout_comp(&comp, &iter, xdr, &status)) {
+ *             // All of @comp strings point to inside the xdr_buffer
+ *             // or scratch buffer. Copy them out to user memory, e.g.
+ *             copy_single_comp(dest_comp++, &comp);
+ *     }
+ *     if (unlikely(status))
+ *             goto err;
+ */
+
+struct pnfs_osd_xdr_decode_layout_iter {
+       unsigned total_comps;
+       unsigned decoded_comps;
+};
+
+extern int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+       struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr);
+
+extern bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+       struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+       int *err);
+
+/* Device Info helpers */
+
+/* Note: All strings inside @deviceaddr point to space inside @p.
+ * @p should stay valid while @deviceaddr is in use.
+ */
+extern void pnfs_osd_xdr_decode_deviceaddr(
+       struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
+
+/* layoutupdate (layout_commit) xdr helpers */
+extern int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+                                struct pnfs_osd_layoutupdate *lou);
+
+/* osd_ioerror encoding/decoding (layout_return) */
+/* Client */
+extern __be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr);
+extern void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr);
+
+#endif /* __PNFS_OSD_XDR_H__ */
index fc84b7a..a20970e 100644 (file)
@@ -216,6 +216,8 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
                unsigned int base, unsigned int len);
 extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+               struct page **pages, unsigned int len);
 extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
 extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
index 679cd67..f008c14 100644 (file)
@@ -638,6 +638,25 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
 }
 EXPORT_SYMBOL_GPL(xdr_init_decode);
 
+/**
+ * xdr_init_decode_pages - Initialize an xdr_stream for decoding data from pages.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer from which to decode data
+ * @pages: list of pages to decode into
+ * @len: length in bytes of buffer in pages
+ */
+void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+                          struct page **pages, unsigned int len)
+{
+       memset(buf, 0, sizeof(*buf));
+       buf->pages =  pages;
+       buf->page_len =  len;
+       buf->buflen =  len;
+       buf->len = len;
+       xdr_init_decode(xdr, buf, NULL);
+}
+EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
+
 static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
 {
        __be32 *p = xdr->p;